tgedr-dataops 0.0.37__tar.gz → 1.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tgedr_dataops-1.0.1/PKG-INFO +72 -0
- tgedr_dataops-1.0.1/README.md +56 -0
- tgedr_dataops-1.0.1/pyproject.toml +135 -0
- {tgedr_dataops-0.0.37/src/tgedr/dataops → tgedr_dataops-1.0.1/src/tgedr_dataops}/commons/s3_connector.py +32 -5
- tgedr_dataops-1.0.1/src/tgedr_dataops/commons/utils_fs.py +187 -0
- tgedr_dataops-1.0.1/src/tgedr_dataops/quality/pandas_validation.py +21 -0
- tgedr_dataops-1.0.1/src/tgedr_dataops/sink/local_fs_file_sink.py +77 -0
- {tgedr_dataops-0.0.37/src/tgedr/dataops → tgedr_dataops-1.0.1/src/tgedr_dataops}/sink/s3_file_sink.py +47 -11
- tgedr_dataops-1.0.1/src/tgedr_dataops/source/abstract_s3_file_source.py +72 -0
- tgedr_dataops-1.0.1/src/tgedr_dataops/source/local_fs_file_source.py +108 -0
- tgedr_dataops-1.0.1/src/tgedr_dataops/source/pd_df_s3_source.py +130 -0
- {tgedr_dataops-0.0.37/src/tgedr/dataops → tgedr_dataops-1.0.1/src/tgedr_dataops}/source/s3_file_copy.py +64 -28
- tgedr_dataops-1.0.1/src/tgedr_dataops/source/s3_file_extended_source.py +68 -0
- {tgedr_dataops-0.0.37/src/tgedr/dataops → tgedr_dataops-1.0.1/src/tgedr_dataops}/source/s3_file_source.py +63 -27
- tgedr_dataops-1.0.1/src/tgedr_dataops/store/fs_single_partition_parquet.py +331 -0
- tgedr_dataops-1.0.1/src/tgedr_dataops/store/local_fs_single_partition_parquet.py +56 -0
- tgedr_dataops-1.0.1/src/tgedr_dataops/store/s3_single_partition_parquet.py +193 -0
- tgedr_dataops-1.0.1/src/tgedr_dataops.egg-info/PKG-INFO +72 -0
- tgedr_dataops-1.0.1/src/tgedr_dataops.egg-info/SOURCES.txt +25 -0
- tgedr_dataops-1.0.1/src/tgedr_dataops.egg-info/requires.txt +6 -0
- tgedr_dataops-1.0.1/src/tgedr_dataops.egg-info/top_level.txt +1 -0
- tgedr_dataops-0.0.37/PKG-INFO +0 -20
- tgedr_dataops-0.0.37/README.md +0 -54
- tgedr_dataops-0.0.37/pyproject.toml +0 -15
- tgedr_dataops-0.0.37/setup.py +0 -36
- tgedr_dataops-0.0.37/src/tgedr/dataops/chain.py +0 -51
- tgedr_dataops-0.0.37/src/tgedr/dataops/commons/dataset.py +0 -23
- tgedr_dataops-0.0.37/src/tgedr/dataops/commons/metadata.py +0 -172
- tgedr_dataops-0.0.37/src/tgedr/dataops/commons/utils_fs.py +0 -85
- tgedr_dataops-0.0.37/src/tgedr/dataops/commons/utils_spark.py +0 -87
- tgedr_dataops-0.0.37/src/tgedr/dataops/etl.py +0 -112
- tgedr_dataops-0.0.37/src/tgedr/dataops/processor.py +0 -27
- tgedr_dataops-0.0.37/src/tgedr/dataops/sink/local_fs_file_sink.py +0 -47
- tgedr_dataops-0.0.37/src/tgedr/dataops/sink/sink.py +0 -46
- tgedr_dataops-0.0.37/src/tgedr/dataops/source/abstract_s3_file_source.py +0 -43
- tgedr_dataops-0.0.37/src/tgedr/dataops/source/delta_table_source.py +0 -49
- tgedr_dataops-0.0.37/src/tgedr/dataops/source/local_delta_table.py +0 -47
- tgedr_dataops-0.0.37/src/tgedr/dataops/source/local_fs_file_source.py +0 -71
- tgedr_dataops-0.0.37/src/tgedr/dataops/source/pd_df_s3_source.py +0 -76
- tgedr_dataops-0.0.37/src/tgedr/dataops/source/s3_delta_table.py +0 -75
- tgedr_dataops-0.0.37/src/tgedr/dataops/source/s3_file_extended_source.py +0 -39
- tgedr_dataops-0.0.37/src/tgedr/dataops/source/source.py +0 -51
- tgedr_dataops-0.0.37/src/tgedr/dataops/store/fs_single_partition_parquet.py +0 -231
- tgedr_dataops-0.0.37/src/tgedr/dataops/store/local_fs_single_partition_parquet.py +0 -24
- tgedr_dataops-0.0.37/src/tgedr/dataops/store/s3_single_partition_parquet.py +0 -102
- tgedr_dataops-0.0.37/src/tgedr/dataops/store/spark_delta.py +0 -369
- tgedr_dataops-0.0.37/src/tgedr/dataops/store/store.py +0 -49
- tgedr_dataops-0.0.37/src/tgedr/dataops/utils_reflection.py +0 -134
- tgedr_dataops-0.0.37/src/tgedr/dataops/validation/abs.py +0 -46
- tgedr_dataops-0.0.37/src/tgedr/dataops/validation/pandas.py +0 -10
- tgedr_dataops-0.0.37/src/tgedr/dataops/validation/pyspark.py +0 -10
- tgedr_dataops-0.0.37/src/tgedr_dataops.egg-info/PKG-INFO +0 -20
- tgedr_dataops-0.0.37/src/tgedr_dataops.egg-info/SOURCES.txt +0 -42
- tgedr_dataops-0.0.37/src/tgedr_dataops.egg-info/requires.txt +0 -7
- tgedr_dataops-0.0.37/src/tgedr_dataops.egg-info/top_level.txt +0 -1
- {tgedr_dataops-0.0.37 → tgedr_dataops-1.0.1}/LICENSE +0 -0
- {tgedr_dataops-0.0.37 → tgedr_dataops-1.0.1}/setup.cfg +0 -0
- {tgedr_dataops-0.0.37/src/tgedr/dataops → tgedr_dataops-1.0.1/src/tgedr_dataops}/__init__.py +0 -0
- {tgedr_dataops-0.0.37/src/tgedr/dataops → tgedr_dataops-1.0.1/src/tgedr_dataops}/sink/__init__.py +0 -0
- {tgedr_dataops-0.0.37/src/tgedr/dataops → tgedr_dataops-1.0.1/src/tgedr_dataops}/source/__init__.py +0 -0
- {tgedr_dataops-0.0.37 → tgedr_dataops-1.0.1}/src/tgedr_dataops.egg-info/dependency_links.txt +0 -0
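The listing above shows every module moving from the nested `tgedr/dataops` namespace package to a flat `tgedr_dataops` package, so import paths change between 0.0.37 and 1.0.1. A minimal before/after sketch, assuming the `S3FileSource` class keeps its module-level name across the rename:

```python
# 0.0.37 layout — modules lived under src/tgedr/dataops/ ("tgedr" namespace package)
from tgedr.dataops.source.s3_file_source import S3FileSource

# 1.0.1 layout — modules live under src/tgedr_dataops/ (flat top-level package)
from tgedr_dataops.source.s3_file_source import S3FileSource
```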
tgedr_dataops-1.0.1/PKG-INFO
@@ -0,0 +1,72 @@
+Metadata-Version: 2.4
+Name: tgedr-dataops
+Version: 1.0.1
+Summary: data operations related code
+Author-email: joao tiago viegas <3536754+jtviegas@users.noreply.github.com>
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: tgedr-dataops-abs==0.0.3
+Requires-Dist: s3fs==2024.5.0
+Requires-Dist: boto3==1.34.106
+Requires-Dist: openpyxl==3.1.2
+Requires-Dist: pyarrow>=23.0.0
+Requires-Dist: moto>=5.1.20
+Dynamic: license-file
+
+# data-ops
+
+
+[](https://pypi.org/project/tgedr-dataops/)
+
+
+
+data operations related code
+
+## motivation
+*data-ops* is a library with tested and used code aligning on some standards regarding code structure and quality and to avoid reinventing the wheel
+
+## installation
+`pip install tgedr-dataops`
+
+## package namespaces and its contents
+
+#### commons
+- __S3Connector__: base class to be extended, providing a connection session with aws s3 resources
+- __utils_fs__: utility module with file system related functions ([example](tests/tgedr_dataops/commons/test_utils_fs.py))
+
+#### quality
+- __PandasValidation__ : __GreatExpectationsValidation__ implementation to validate pandas dataframes with Great Expectations library ([example](tests/tgedr_dataops/quality/test_pandas_validation.py))
+
+
+#### sink
+- __LocalFsFileSink__: __Sink__ implementation class used to save/persist an object/file to a local fs location ([example](tests/tgedr_dataops/sink/test_localfs_file_sink.py))
+- __S3FileSink__: __Sink__ implementation class used to save/persist a local object/file to an s3 bucket ([example](tests/tgedr_dataops/sink/test_s3_file_sink.py))
+
+#### source
+- __AbstractS3FileSource__: abstract __Source__ class used to retrieve objects/files from s3 bucket to local fs location circumventing some formats download limitation
+- __LocalFsFileSource__: __Source__ implementation class used to retrieve local objects/files to another local fs location ([example](tests/tgedr_dataops/source/test_localfs_file_source.py))
+- __PdDfS3Source__: __Source__ implementation class used to read a pandas dataframe from s3, whether a csv or an excel (xslx) file ([example csv](tests/tgedr_dataops/source/test_pd_df_s3_source_csv.py), [example excel](tests/tgedr_dataops/source/test_pd_df_s3_source_excel.py))
+- __S3FileCopy__: __Source__ implementation class used to copy objects/files from an s3 bucket to another s3 bucket ([example](tests/tgedr_dataops/source/test_s3_copy.py))
+- __S3FileExtendedSource__: __Source__ implementation class used to retrieve objects/files from s3 bucket to local fs location with the extra method `get_metadata` providing sile metadata ("LastModified", "ContentLength", "ETag", "VersionId", "ContentType")([example](tests/tgedr_dataops/source/test_s3_file_extended_source.py))
+- __S3FileSource__: __Source__ implementation class used to retrieve objects/files from s3 bucket to local fs location ([example](tests/tgedr_dataops/source/test_s3_file_source.py))
+
+#### store
+- __FsSinglePartitionParquetStore__ : abstract __Store__ implementation defining persistence on parquet files with an optional single partition, regardless of the location it should persist
+- __LocalFsSinglePartitionParquetStore__ : __FsSinglePartitionParquetStore__ implementation using local file system ([example](tests/tgedr_dataops/store/test_local_fs_single_partition_parquet.py))
+- __S3FsSinglePartitionParquetStore__ : __FsSinglePartitionParquetStore__ implementation using aws s3 file system ([example](tests/tgedr_dataops/store/MANUAL_test_s3_single_partition_parquet.py))
+
+
+## development
+- main requirements:
+- _uv_
+- _bash_
+- Clone the repository like this:
+
+``` bash
+git clone git@github.com:tgedr/pycommons
+```
+- cd into the folder: `cd pycommons`
+- install requirements: `./helper.sh reqs`
+
+
tgedr_dataops-1.0.1/README.md
@@ -0,0 +1,56 @@
+# data-ops
+
+
+[](https://pypi.org/project/tgedr-dataops/)
+
+
+
+data operations related code
+
+## motivation
+*data-ops* is a library with tested and used code aligning on some standards regarding code structure and quality and to avoid reinventing the wheel
+
+## installation
+`pip install tgedr-dataops`
+
+## package namespaces and its contents
+
+#### commons
+- __S3Connector__: base class to be extended, providing a connection session with aws s3 resources
+- __utils_fs__: utility module with file system related functions ([example](tests/tgedr_dataops/commons/test_utils_fs.py))
+
+#### quality
+- __PandasValidation__ : __GreatExpectationsValidation__ implementation to validate pandas dataframes with Great Expectations library ([example](tests/tgedr_dataops/quality/test_pandas_validation.py))
+
+
+#### sink
+- __LocalFsFileSink__: __Sink__ implementation class used to save/persist an object/file to a local fs location ([example](tests/tgedr_dataops/sink/test_localfs_file_sink.py))
+- __S3FileSink__: __Sink__ implementation class used to save/persist a local object/file to an s3 bucket ([example](tests/tgedr_dataops/sink/test_s3_file_sink.py))
+
+#### source
+- __AbstractS3FileSource__: abstract __Source__ class used to retrieve objects/files from s3 bucket to local fs location circumventing some formats download limitation
+- __LocalFsFileSource__: __Source__ implementation class used to retrieve local objects/files to another local fs location ([example](tests/tgedr_dataops/source/test_localfs_file_source.py))
+- __PdDfS3Source__: __Source__ implementation class used to read a pandas dataframe from s3, whether a csv or an excel (xslx) file ([example csv](tests/tgedr_dataops/source/test_pd_df_s3_source_csv.py), [example excel](tests/tgedr_dataops/source/test_pd_df_s3_source_excel.py))
+- __S3FileCopy__: __Source__ implementation class used to copy objects/files from an s3 bucket to another s3 bucket ([example](tests/tgedr_dataops/source/test_s3_copy.py))
+- __S3FileExtendedSource__: __Source__ implementation class used to retrieve objects/files from s3 bucket to local fs location with the extra method `get_metadata` providing sile metadata ("LastModified", "ContentLength", "ETag", "VersionId", "ContentType")([example](tests/tgedr_dataops/source/test_s3_file_extended_source.py))
+- __S3FileSource__: __Source__ implementation class used to retrieve objects/files from s3 bucket to local fs location ([example](tests/tgedr_dataops/source/test_s3_file_source.py))
+
+#### store
+- __FsSinglePartitionParquetStore__ : abstract __Store__ implementation defining persistence on parquet files with an optional single partition, regardless of the location it should persist
+- __LocalFsSinglePartitionParquetStore__ : __FsSinglePartitionParquetStore__ implementation using local file system ([example](tests/tgedr_dataops/store/test_local_fs_single_partition_parquet.py))
+- __S3FsSinglePartitionParquetStore__ : __FsSinglePartitionParquetStore__ implementation using aws s3 file system ([example](tests/tgedr_dataops/store/MANUAL_test_s3_single_partition_parquet.py))
+
+
+## development
+- main requirements:
+- _uv_
+- _bash_
+- Clone the repository like this:
+
+``` bash
+git clone git@github.com:tgedr/pycommons
+```
+- cd into the folder: `cd pycommons`
+- install requirements: `./helper.sh reqs`
+
+
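The sink and source classes listed in the README share a small context-dict style of API. As a rough illustration of that pattern, here is a sketch using LocalFsFileSink, whose full implementation appears later in this diff; it assumes the abstract Sink base from tgedr-dataops-abs needs no constructor arguments, and the paths are illustrative:

```python
from tgedr_dataops.sink.local_fs_file_sink import LocalFsFileSink

# persist a local file to another local location via the Sink interface
sink = LocalFsFileSink()
sink.put(context={"source": "/tmp/report.csv", "target": "/tmp/archive/report.csv"})
```

The S3-backed sinks and sources listed above follow the same context-dict style, though each class defines its own expected context keys.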
tgedr_dataops-1.0.1/pyproject.toml
@@ -0,0 +1,135 @@
+[project]
+name = "tgedr-dataops"
+version = "1.0.1"
+description = "data operations related code"
+authors = [
+    {name = "joao tiago viegas",email = "3536754+jtviegas@users.noreply.github.com"}
+]
+readme = "README.md"
+requires-python = ">=3.11"
+
+dependencies = [
+    "tgedr-dataops-abs==0.0.3",
+    "s3fs==2024.5.0",
+    "boto3==1.34.106",
+    "openpyxl==3.1.2",
+    "pyarrow>=23.0.0",
+    "moto>=5.1.20",
+]
+[dependency-groups]
+dev = [
+    "pre-commit~=4.2.0",
+    "pytest~=8.3.5",
+    "pytest-bdd~=8.1.0",
+    "pytest-cov~=4.1.0",
+    "pytest-mock~=3.15.0",
+    "ruff==0.9.10",
+    "bandit==1.8.3",
+    "safety==3.5.1",
+    "typer<0.17.0",
+    "genbadge[coverage]>=1.1.3",
+]
+
+# [project.scripts]
+# run = "tgedr.pycommons.entrypoint:entrypoint"
+
+[build-system]
+requires = ["setuptools>=78.1.0", "wheel>=0.45.1"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.setuptools]
+include-package-data = true
+
+[tool.setuptools.package-data]
+"*" = ["CHANGELOG"]
+
+[tool.coverage.paths]
+source = ["src/"]
+
+[tool.coverage.run]
+source = ["src/"]
+include = ["src/*"]
+omit = [
+    "*/tests/*",
+    "*/test_*",
+    "*/__pycache__/*",
+    "*/migrations/*",
+    "*/venv/*",
+    "*/.venv/*"
+]
+
+[tool.coverage.report]
+exclude_lines = [
+    "pragma: no cover",
+    "def __repr__",
+    "raise AssertionError",
+    "raise NotImplementedError",
+    "if __name__ == .__main__.:",
+    "if TYPE_CHECKING:",
+]
+show_missing = true
+skip_covered = false
+skip_empty = false
+
+[tool.pytest.ini_options]
+# bdd_features_base_dir = "documentation/features"
+pythonpath = "."
+
+[tool.ruff]
+exclude = [
+    ".bzr",
+    ".direnv",
+    ".eggs",
+    ".git",
+    ".git-rewrite",
+    ".hg",
+    ".ipynb_checkpoints",
+    ".mypy_cache",
+    ".nox",
+    ".pants.d",
+    ".pyenv",
+    ".pytest_cache",
+    ".pytype",
+    ".ruff_cache",
+    ".svn",
+    ".tox",
+    ".venv",
+    ".vscode",
+    "__pypackages__",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "node_modules",
+    "site-packages",
+    "venv",
+    "tests/",
+    "typings/"
+]
+
+line-length = 120
+indent-width = 4
+
+[tool.ruff.lint]
+select = ["ALL"]
+ignore = ["D203", "S101", "D104", "INP001", "D213", "COM812", "I001",
+    "D401", "D407", "RET504", "PLR2004", "FA102", "E501", "EXE002", "PLR0913",
+    "PLR0912", "C901", "PLR0911", "D413", "N818", "B024", "ANN401", "SIM300",
+    "FBT001", "FBT002", "G004", "TRY003", "EM102", "EM101", "PD015", "PD901"]
+fixable = ["ALL"]
+unfixable = []
+# Allow unused variables when underscore-prefixed.
+dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
+
+[tool.ruff.format]
+# Like Black, use double quotes for strings.
+quote-style = "double"
+# Like Black, indent with spaces, rather than tabs.
+indent-style = "space"
+# Like Black, respect magic trailing commas.
+skip-magic-trailing-comma = false
+# Like Black, automatically detect the appropriate line ending.
+line-ending = "auto"
{tgedr_dataops-0.0.37/src/tgedr/dataops → tgedr_dataops-1.0.1/src/tgedr_dataops}/commons/s3_connector.py
@@ -1,17 +1,30 @@
+"""AWS S3 connector module.
+
+This module provides the S3Connector class for establishing and managing
+connections to AWS S3 resources.
+"""
 import os
 import boto3
 
 
 class S3Connector:
-    """utility base class to be extended, providing a connection session with aws s3 resources"""
+    """utility base class to be extended, providing a connection session with aws s3 resources."""
 
-    def __init__(self):
+    def __init__(self) -> None:
+        """Initialize the S3Connector with empty session, resource, and client attributes."""
         self.__resource = None
         self.__session = None
         self.__client = None
 
     @property
-    def _session(self):
+    def _session(self) -> boto3.Session:
+        """Get or create a boto3 session.
+
+        Returns
+        -------
+        boto3.Session
+            The boto3 session instance, using credentials if configured.
+        """
         if self.__session is None:
             if "1" == os.getenv("S3_CONNECTOR_USE_CREDENTIALS", default="0"):
                 self.__session = boto3.Session(
@@ -26,13 +39,27 @@ class S3Connector:
         return self.__session
 
     @property
-    def _resource(self):
+    def _resource(self) -> boto3.resources.base.ServiceResource:
+        """Get or create an S3 resource.
+
+        Returns
+        -------
+        boto3.resources.base.ServiceResource
+            The boto3 S3 resource instance.
+        """
         if self.__resource is None:
             self.__resource = self._session.resource("s3")
         return self.__resource
 
     @property
-    def _client(self):
+    def _client(self) -> boto3.client:
+        """Get or create an S3 client.
+
+        Returns
+        -------
+        boto3.client
+            The boto3 S3 client instance.
+        """
         if self.__client is None:
             self.__client = self._session.client("s3")
         return self.__client
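Since S3Connector is meant to be extended, a subclass simply reaches for the lazily created `_client`, `_resource`, or `_session` properties; explicit credentials are only pulled from the environment when `S3_CONNECTOR_USE_CREDENTIALS` is set to "1". A minimal sketch of such a subclass (the `BucketLister` name and its method are illustrative, not part of the package):

```python
from tgedr_dataops.commons.s3_connector import S3Connector


class BucketLister(S3Connector):
    """Hypothetical extension that lists object keys under a prefix."""

    def keys(self, bucket: str, prefix: str = "") -> list[str]:
        # the inherited _client property creates the boto3 S3 client on first use
        response = self._client.list_objects_v2(Bucket=bucket, Prefix=prefix)
        return [item["Key"] for item in response.get("Contents", [])]
```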
tgedr_dataops-1.0.1/src/tgedr_dataops/commons/utils_fs.py
@@ -0,0 +1,187 @@
+"""Filesystem utility functions for file and path operations.
+
+This module provides utilities for:
+- Creating temporary files and directories
+- Parsing and processing S3 URLs and paths
+- Extracting URL protocols
+- Generating file hashes
+"""
+import tempfile
+import re
+import hashlib
+from pathlib import Path
+
+
+def temp_dir(root: str | None = None, suffix: str | None = None, prefix: str | None = None) -> str:
+    """Create a temporary directory and return its path.
+
+    Parameters
+    ----------
+    root : str | None
+        Directory where the temporary directory will be created.
+    suffix : str | None
+        Suffix for the temporary directory name.
+    prefix : str | None
+        Prefix for the temporary directory name.
+
+    Returns
+    -------
+    str
+        Path to the created temporary directory.
+    """
+    return tempfile.mkdtemp(suffix=suffix, prefix=prefix, dir=root)
+
+
+def temp_file(
+    root: str | None = None, suffix: str | None = None, prefix: str | None = None,
+    discard_handle: bool = True
+) -> str | tuple[int, str]:
+    """Create a temporary file and return its path or handle and path.
+
+    Parameters
+    ----------
+    root : str | None
+        Directory where the temporary file will be created.
+    suffix : str | None
+        Suffix for the temporary file name.
+    prefix : str | None
+        Prefix for the temporary file name.
+    discard_handle : bool
+        If True, return only the file path. If False, return tuple of (handle, path).
+
+    Returns
+    -------
+    str | tuple[int, str]
+        File path if discard_handle is True, otherwise tuple of (file handle, file path).
+    """
+    h, f = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=root)
+    if discard_handle:
+        return f
+    return (h, f)
+
+
+def resolve_url_protocol(url: str) -> str:
+    """Extract the protocol from a URL.
+
+    Parameters
+    ----------
+    url : str
+        The URL to extract the protocol from.
+
+    Returns
+    -------
+    str
+        The protocol (e.g., 'http://', 'https://') or None if no protocol is found.
+    """
+    result = None
+    group_match = re.search("(.*://).*", url)
+    if group_match is not None:
+        result = group_match.group(1)
+    return result
+
+
+def resolve_s3_protocol(url: str) -> str:
+    """Extract the S3 protocol from a URL.
+
+    Parameters
+    ----------
+    url : str
+        The URL to extract the S3 protocol from.
+
+    Returns
+    -------
+    str
+        The S3 protocol (e.g., 's3://', 's3a://') or None if no S3 protocol is found.
+    """
+    result = None
+    group_match = re.search("(s3[a]?://).*", url)
+    if group_match is not None:
+        result = group_match.group(1)
+    return result
+
+
+def remove_s3_protocol(url: str) -> str:
+    """Remove the S3 protocol prefix from a URL.
+
+    Parameters
+    ----------
+    url : str
+        The S3 URL to remove the protocol from.
+
+    Returns
+    -------
+    str
+        The URL without the S3 protocol prefix (s3:// or s3a://).
+    """
+    if url.startswith("s3://"):
+        result = url[5:]
+    elif url.startswith("s3a://"):
+        result = url[6:]
+    else:
+        result = url
+    return result
+
+
+def process_s3_path(path: str) -> tuple[str, str]:
+    """Extract bucket and key from an S3 path.
+
+    Parameters
+    ----------
+    path : str
+        The S3 path to process (with or without protocol).
+
+    Returns
+    -------
+    tuple[str, str]
+        A tuple containing (bucket, key).
+    """
+    no_protocol_path = remove_s3_protocol(path)
+    path_elements = no_protocol_path.split("/")
+    bucket = path_elements[0]
+    key = "/".join(path_elements[1:])
+    return (bucket, key)
+
+
+def process_s3_url(url: str) -> tuple[str, str, str]:
+    """Extract protocol, bucket, and key from an S3 URL.
+
+    Parameters
+    ----------
+    url : str
+        The S3 URL to process.
+
+    Returns
+    -------
+    tuple[str, str, str]
+        A tuple containing (protocol, bucket, key).
+    """
+    protocol = resolve_s3_protocol(url)
+    no_protocol_url = remove_s3_protocol(url)
+    path_elements = no_protocol_url.split("/")
+    bucket = path_elements[0]
+    key = "/".join(path_elements[1:])
+    return ("" if protocol is None else protocol, bucket, key)
+
+
+def hash_file(filepath: str, hash_func=hashlib.sha256) -> str: # noqa: ANN001
+    """Generate a hash for a file.
+
+    Args:
+        filepath (str): The path to the file.
+        hash_func: A hashlib hash function, e.g., hashlib.md5().
+
+    Returns:
+        str: The hexadecimal hash string of the file.
+    """
+    # Initialize the hash object
+    hasher = hash_func()
+
+    # Open the file in binary read mode
+    with Path(filepath).open("rb") as file:
+        # Read the file in chunks to avoid using too much memory
+        chunk_size = 8192
+        while chunk := file.read(chunk_size):
+            hasher.update(chunk)
+
+    # Return the hexadecimal digest of the hash
+    return hasher.hexdigest()
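A rough usage sketch of these helpers as added in 1.0.1 (bucket and file names are illustrative):

```python
from pathlib import Path

from tgedr_dataops.commons.utils_fs import hash_file, process_s3_url, remove_s3_protocol, temp_file

protocol, bucket, key = process_s3_url("s3a://my-bucket/raw/2024/data.csv")
# -> ("s3a://", "my-bucket", "raw/2024/data.csv")

assert remove_s3_protocol("s3://my-bucket/raw") == "my-bucket/raw"

path = temp_file(suffix=".txt")   # the OS file handle is discarded by default, only the path is returned
Path(path).write_text("hello")
digest = hash_file(path)          # hex digest, sha256 unless another hashlib constructor is passed
```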
tgedr_dataops-1.0.1/src/tgedr_dataops/quality/pandas_validation.py
@@ -0,0 +1,21 @@
+"""Pandas DataFrame validation implementation module.
+
+This module provides the Pandas-specific implementation of Great Expectations validation.
+"""
+from great_expectations.execution_engine import ExecutionEngine
+from great_expectations.execution_engine import PandasExecutionEngine
+from tgedr_dataops_abs.great_expectations_validation import GreatExpectationsValidation
+
+
+class PandasValidation(GreatExpectationsValidation):
+    """Pandas DataFrame validation implementation."""
+
+    def _get_execution_engine(self, batch_data_dict: dict) -> ExecutionEngine:
+        """Get the execution engine used by the validation implementation.
+
+        Returns
+        -------
+        ExecutionEngine
+            The execution engine instance.
+        """
+        return PandasExecutionEngine(batch_data_dict=batch_data_dict)
tgedr_dataops-1.0.1/src/tgedr_dataops/sink/local_fs_file_sink.py
@@ -0,0 +1,77 @@
+"""Local filesystem sink implementation for persisting files.
+
+This module provides:
+- LocalFsFileSink: sink class for saving/persisting files to local filesystem.
+"""
+
+import logging
+import shutil
+from pathlib import Path
+from typing import Any
+
+from tgedr_dataops_abs.sink import Sink, SinkException
+
+
+logger = logging.getLogger(__name__)
+
+
+class LocalFsFileSink(Sink):
+    """sink class used to save/persist an object/file to a local fs location."""
+
+    CONTEXT_SOURCE_PATH = "source"
+    CONTEXT_TARGET_PATH = "target"
+
+    def put(self, context: dict[str, Any] | None = None) -> Any:
+        """Copy a file from source to target on local filesystem.
+
+        Parameters
+        ----------
+        context : dict[str, Any], optional
+            Context dictionary containing 'source' and 'target' paths.
+
+        Raises
+        ------
+        SinkException
+            If source or target context is missing.
+        """
+        logger.info(f"[put|in] ({context})")
+
+        if self.CONTEXT_SOURCE_PATH not in context:
+            raise SinkException(f"you must provide context for {self.CONTEXT_SOURCE_PATH}")
+        if self.CONTEXT_TARGET_PATH not in context:
+            raise SinkException(f"you must provide context for {self.CONTEXT_TARGET_PATH}")
+
+        source = context[self.CONTEXT_SOURCE_PATH]
+        target = context[self.CONTEXT_TARGET_PATH]
+
+        shutil.copy(source, target)
+        logger.info("[put|out]")
+
+    def delete(self, context: dict[str, Any] | None = None) -> None:
+        """Delete a file or directory from local filesystem.
+
+        Parameters
+        ----------
+        context : dict[str, Any], optional
+            Context dictionary containing 'target' path.
+
+        Raises
+        ------
+        SinkException
+            If target context is missing or target is neither file nor directory.
+        """
+        logger.info(f"[delete|in] ({context})")
+
+        if self.CONTEXT_TARGET_PATH not in context:
+            raise SinkException(f"you must provide context for {self.CONTEXT_TARGET_PATH}")
+
+        target = Path(context[self.CONTEXT_TARGET_PATH])
+
+        if target.is_file():
+            target.unlink()
+        elif target.is_dir():
+            shutil.rmtree(target)
+        else:
+            raise SinkException(f"[delete] is it a dir or a folder? {target}")
+
+        logger.info("[delete|out]")
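A short pytest-style sketch of how this sink behaves; it assumes the abstract Sink base from tgedr-dataops-abs can be instantiated without arguments, and the test names are illustrative:

```python
from pathlib import Path

import pytest
from tgedr_dataops_abs.sink import SinkException

from tgedr_dataops.sink.local_fs_file_sink import LocalFsFileSink


def test_put_copies_and_delete_removes(tmp_path: Path) -> None:
    source = tmp_path / "in.txt"
    source.write_text("payload")
    target = tmp_path / "out.txt"

    sink = LocalFsFileSink()
    # put copies the file referenced by the "source" context key to the "target" path
    sink.put(context={"source": str(source), "target": str(target)})
    assert target.read_text() == "payload"

    # delete removes a file (or a whole directory tree) referenced by the "target" key
    sink.delete(context={"target": str(target)})
    assert not target.exists()


def test_put_requires_source_and_target() -> None:
    # missing "source" in the context raises SinkException before anything is copied
    with pytest.raises(SinkException):
        LocalFsFileSink().put(context={"target": "/tmp/out.txt"})
```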