tgedr-dataops 0.0.36__tar.gz → 1.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. tgedr_dataops-1.0.1/PKG-INFO +72 -0
  2. tgedr_dataops-1.0.1/README.md +56 -0
  3. tgedr_dataops-1.0.1/pyproject.toml +135 -0
  4. {tgedr_dataops-0.0.36/src/tgedr/dataops → tgedr_dataops-1.0.1/src/tgedr_dataops}/commons/s3_connector.py +32 -5
  5. tgedr_dataops-1.0.1/src/tgedr_dataops/commons/utils_fs.py +187 -0
  6. tgedr_dataops-1.0.1/src/tgedr_dataops/quality/pandas_validation.py +21 -0
  7. tgedr_dataops-1.0.1/src/tgedr_dataops/sink/local_fs_file_sink.py +77 -0
  8. {tgedr_dataops-0.0.36/src/tgedr/dataops → tgedr_dataops-1.0.1/src/tgedr_dataops}/sink/s3_file_sink.py +47 -11
  9. tgedr_dataops-1.0.1/src/tgedr_dataops/source/abstract_s3_file_source.py +72 -0
  10. tgedr_dataops-1.0.1/src/tgedr_dataops/source/local_fs_file_source.py +108 -0
  11. tgedr_dataops-1.0.1/src/tgedr_dataops/source/pd_df_s3_source.py +130 -0
  12. {tgedr_dataops-0.0.36/src/tgedr/dataops → tgedr_dataops-1.0.1/src/tgedr_dataops}/source/s3_file_copy.py +64 -28
  13. tgedr_dataops-1.0.1/src/tgedr_dataops/source/s3_file_extended_source.py +68 -0
  14. {tgedr_dataops-0.0.36/src/tgedr/dataops → tgedr_dataops-1.0.1/src/tgedr_dataops}/source/s3_file_source.py +60 -39
  15. tgedr_dataops-1.0.1/src/tgedr_dataops/store/fs_single_partition_parquet.py +331 -0
  16. tgedr_dataops-1.0.1/src/tgedr_dataops/store/local_fs_single_partition_parquet.py +56 -0
  17. tgedr_dataops-1.0.1/src/tgedr_dataops/store/s3_single_partition_parquet.py +193 -0
  18. tgedr_dataops-1.0.1/src/tgedr_dataops.egg-info/PKG-INFO +72 -0
  19. tgedr_dataops-1.0.1/src/tgedr_dataops.egg-info/SOURCES.txt +25 -0
  20. tgedr_dataops-1.0.1/src/tgedr_dataops.egg-info/requires.txt +6 -0
  21. tgedr_dataops-1.0.1/src/tgedr_dataops.egg-info/top_level.txt +1 -0
  22. tgedr_dataops-0.0.36/PKG-INFO +0 -19
  23. tgedr_dataops-0.0.36/README.md +0 -53
  24. tgedr_dataops-0.0.36/pyproject.toml +0 -15
  25. tgedr_dataops-0.0.36/setup.py +0 -35
  26. tgedr_dataops-0.0.36/src/tgedr/dataops/chain.py +0 -51
  27. tgedr_dataops-0.0.36/src/tgedr/dataops/commons/dataset.py +0 -23
  28. tgedr_dataops-0.0.36/src/tgedr/dataops/commons/metadata.py +0 -172
  29. tgedr_dataops-0.0.36/src/tgedr/dataops/commons/utils_fs.py +0 -85
  30. tgedr_dataops-0.0.36/src/tgedr/dataops/commons/utils_spark.py +0 -87
  31. tgedr_dataops-0.0.36/src/tgedr/dataops/etl.py +0 -112
  32. tgedr_dataops-0.0.36/src/tgedr/dataops/processor.py +0 -27
  33. tgedr_dataops-0.0.36/src/tgedr/dataops/sink/local_fs_file_sink.py +0 -47
  34. tgedr_dataops-0.0.36/src/tgedr/dataops/sink/sink.py +0 -46
  35. tgedr_dataops-0.0.36/src/tgedr/dataops/source/abstract_s3_file_source.py +0 -43
  36. tgedr_dataops-0.0.36/src/tgedr/dataops/source/delta_table_source.py +0 -49
  37. tgedr_dataops-0.0.36/src/tgedr/dataops/source/local_delta_table.py +0 -47
  38. tgedr_dataops-0.0.36/src/tgedr/dataops/source/local_fs_file_source.py +0 -71
  39. tgedr_dataops-0.0.36/src/tgedr/dataops/source/pd_df_s3_source.py +0 -51
  40. tgedr_dataops-0.0.36/src/tgedr/dataops/source/s3_delta_table.py +0 -75
  41. tgedr_dataops-0.0.36/src/tgedr/dataops/source/source.py +0 -51
  42. tgedr_dataops-0.0.36/src/tgedr/dataops/store/fs_single_partition_parquet.py +0 -231
  43. tgedr_dataops-0.0.36/src/tgedr/dataops/store/local_fs_single_partition_parquet.py +0 -24
  44. tgedr_dataops-0.0.36/src/tgedr/dataops/store/s3_single_partition_parquet.py +0 -102
  45. tgedr_dataops-0.0.36/src/tgedr/dataops/store/spark_delta.py +0 -369
  46. tgedr_dataops-0.0.36/src/tgedr/dataops/store/store.py +0 -49
  47. tgedr_dataops-0.0.36/src/tgedr/dataops/utils_reflection.py +0 -134
  48. tgedr_dataops-0.0.36/src/tgedr/dataops/validation/abs.py +0 -46
  49. tgedr_dataops-0.0.36/src/tgedr/dataops/validation/pandas.py +0 -10
  50. tgedr_dataops-0.0.36/src/tgedr/dataops/validation/pyspark.py +0 -10
  51. tgedr_dataops-0.0.36/src/tgedr_dataops.egg-info/PKG-INFO +0 -19
  52. tgedr_dataops-0.0.36/src/tgedr_dataops.egg-info/SOURCES.txt +0 -41
  53. tgedr_dataops-0.0.36/src/tgedr_dataops.egg-info/requires.txt +0 -6
  54. tgedr_dataops-0.0.36/src/tgedr_dataops.egg-info/top_level.txt +0 -1
  55. {tgedr_dataops-0.0.36 → tgedr_dataops-1.0.1}/LICENSE +0 -0
  56. {tgedr_dataops-0.0.36 → tgedr_dataops-1.0.1}/setup.cfg +0 -0
  57. {tgedr_dataops-0.0.36/src/tgedr/dataops → tgedr_dataops-1.0.1/src/tgedr_dataops}/__init__.py +0 -0
  58. {tgedr_dataops-0.0.36/src/tgedr/dataops → tgedr_dataops-1.0.1/src/tgedr_dataops}/sink/__init__.py +0 -0
  59. {tgedr_dataops-0.0.36/src/tgedr/dataops → tgedr_dataops-1.0.1/src/tgedr_dataops}/source/__init__.py +0 -0
  60. {tgedr_dataops-0.0.36 → tgedr_dataops-1.0.1}/src/tgedr_dataops.egg-info/dependency_links.txt +0 -0
@@ -0,0 +1,72 @@
+ Metadata-Version: 2.4
+ Name: tgedr-dataops
+ Version: 1.0.1
+ Summary: data operations related code
+ Author-email: joao tiago viegas <3536754+jtviegas@users.noreply.github.com>
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: tgedr-dataops-abs==0.0.3
+ Requires-Dist: s3fs==2024.5.0
+ Requires-Dist: boto3==1.34.106
+ Requires-Dist: openpyxl==3.1.2
+ Requires-Dist: pyarrow>=23.0.0
+ Requires-Dist: moto>=5.1.20
+ Dynamic: license-file
+
+ # data-ops
+
+ ![Coverage](./coverage.svg)
+ [![PyPI](https://img.shields.io/pypi/v/tgedr-dataops)](https://pypi.org/project/tgedr-dataops/)
+
+
+
+ data operations related code
+
+ ## motivation
+ *data-ops* is a library of tested and used code, aligning on standards for code structure and quality and avoiding reinventing the wheel
+
+ ## installation
+ `pip install tgedr-dataops`
+
+ ## package namespaces and their contents
+
+ #### commons
+ - __S3Connector__: base class to be extended, providing a connection session with aws s3 resources
+ - __utils_fs__: utility module with file system related functions ([example](tests/tgedr_dataops/commons/test_utils_fs.py))
+
+ #### quality
+ - __PandasValidation__ : __GreatExpectationsValidation__ implementation to validate pandas dataframes with the Great Expectations library ([example](tests/tgedr_dataops/quality/test_pandas_validation.py))
+
+
+ #### sink
+ - __LocalFsFileSink__: __Sink__ implementation class used to save/persist an object/file to a local fs location ([example](tests/tgedr_dataops/sink/test_localfs_file_sink.py))
+ - __S3FileSink__: __Sink__ implementation class used to save/persist a local object/file to an s3 bucket ([example](tests/tgedr_dataops/sink/test_s3_file_sink.py))
+
+ #### source
+ - __AbstractS3FileSource__: abstract __Source__ class used to retrieve objects/files from an s3 bucket to a local fs location, circumventing the download limitations of some formats
+ - __LocalFsFileSource__: __Source__ implementation class used to retrieve local objects/files into another local fs location ([example](tests/tgedr_dataops/source/test_localfs_file_source.py))
+ - __PdDfS3Source__: __Source__ implementation class used to read a pandas dataframe from s3, whether a csv or an excel (xlsx) file ([example csv](tests/tgedr_dataops/source/test_pd_df_s3_source_csv.py), [example excel](tests/tgedr_dataops/source/test_pd_df_s3_source_excel.py))
+ - __S3FileCopy__: __Source__ implementation class used to copy objects/files from one s3 bucket to another ([example](tests/tgedr_dataops/source/test_s3_copy.py))
+ - __S3FileExtendedSource__: __Source__ implementation class used to retrieve objects/files from an s3 bucket to a local fs location, with the extra method `get_metadata` providing file metadata ("LastModified", "ContentLength", "ETag", "VersionId", "ContentType") ([example](tests/tgedr_dataops/source/test_s3_file_extended_source.py))
+ - __S3FileSource__: __Source__ implementation class used to retrieve objects/files from an s3 bucket to a local fs location ([example](tests/tgedr_dataops/source/test_s3_file_source.py))
+
+ #### store
+ - __FsSinglePartitionParquetStore__ : abstract __Store__ implementation defining persistence on parquet files with an optional single partition, regardless of the location it persists to
+ - __LocalFsSinglePartitionParquetStore__ : __FsSinglePartitionParquetStore__ implementation using the local file system ([example](tests/tgedr_dataops/store/test_local_fs_single_partition_parquet.py))
+ - __S3FsSinglePartitionParquetStore__ : __FsSinglePartitionParquetStore__ implementation using the aws s3 file system ([example](tests/tgedr_dataops/store/MANUAL_test_s3_single_partition_parquet.py))
+
+
+ ## development
+ - main requirements:
+ - _uv_
+ - _bash_
+ - Clone the repository like this:
+
+ ``` bash
+ git clone git@github.com:tgedr/pycommons
+ ```
+ - cd into the folder: `cd pycommons`
+ - install requirements: `./helper.sh reqs`
+
+
@@ -0,0 +1,56 @@
+ # data-ops
+
+ ![Coverage](./coverage.svg)
+ [![PyPI](https://img.shields.io/pypi/v/tgedr-dataops)](https://pypi.org/project/tgedr-dataops/)
+
+
+
+ data operations related code
+
+ ## motivation
+ *data-ops* is a library of tested and used code, aligning on standards for code structure and quality and avoiding reinventing the wheel
+
+ ## installation
+ `pip install tgedr-dataops`
+
+ ## package namespaces and their contents
+
+ #### commons
+ - __S3Connector__: base class to be extended, providing a connection session with aws s3 resources
+ - __utils_fs__: utility module with file system related functions ([example](tests/tgedr_dataops/commons/test_utils_fs.py))
+
+ #### quality
+ - __PandasValidation__ : __GreatExpectationsValidation__ implementation to validate pandas dataframes with the Great Expectations library ([example](tests/tgedr_dataops/quality/test_pandas_validation.py))
+
+
+ #### sink
+ - __LocalFsFileSink__: __Sink__ implementation class used to save/persist an object/file to a local fs location ([example](tests/tgedr_dataops/sink/test_localfs_file_sink.py))
+ - __S3FileSink__: __Sink__ implementation class used to save/persist a local object/file to an s3 bucket ([example](tests/tgedr_dataops/sink/test_s3_file_sink.py))
+
+ #### source
+ - __AbstractS3FileSource__: abstract __Source__ class used to retrieve objects/files from an s3 bucket to a local fs location, circumventing the download limitations of some formats
+ - __LocalFsFileSource__: __Source__ implementation class used to retrieve local objects/files into another local fs location ([example](tests/tgedr_dataops/source/test_localfs_file_source.py))
+ - __PdDfS3Source__: __Source__ implementation class used to read a pandas dataframe from s3, whether a csv or an excel (xlsx) file ([example csv](tests/tgedr_dataops/source/test_pd_df_s3_source_csv.py), [example excel](tests/tgedr_dataops/source/test_pd_df_s3_source_excel.py))
+ - __S3FileCopy__: __Source__ implementation class used to copy objects/files from one s3 bucket to another ([example](tests/tgedr_dataops/source/test_s3_copy.py))
+ - __S3FileExtendedSource__: __Source__ implementation class used to retrieve objects/files from an s3 bucket to a local fs location, with the extra method `get_metadata` providing file metadata ("LastModified", "ContentLength", "ETag", "VersionId", "ContentType") ([example](tests/tgedr_dataops/source/test_s3_file_extended_source.py))
+ - __S3FileSource__: __Source__ implementation class used to retrieve objects/files from an s3 bucket to a local fs location ([example](tests/tgedr_dataops/source/test_s3_file_source.py))
+
+ #### store
+ - __FsSinglePartitionParquetStore__ : abstract __Store__ implementation defining persistence on parquet files with an optional single partition, regardless of the location it persists to
+ - __LocalFsSinglePartitionParquetStore__ : __FsSinglePartitionParquetStore__ implementation using the local file system ([example](tests/tgedr_dataops/store/test_local_fs_single_partition_parquet.py))
+ - __S3FsSinglePartitionParquetStore__ : __FsSinglePartitionParquetStore__ implementation using the aws s3 file system ([example](tests/tgedr_dataops/store/MANUAL_test_s3_single_partition_parquet.py))
+
+
+ ## development
+ - main requirements:
+ - _uv_
+ - _bash_
+ - Clone the repository like this:
+
+ ``` bash
+ git clone git@github.com:tgedr/pycommons
+ ```
+ - cd into the folder: `cd pycommons`
+ - install requirements: `./helper.sh reqs`
+
+
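
The README above describes the new flat `tgedr_dataops` namespace that replaces the 0.0.36 `tgedr.dataops` layout; as a minimal sketch of the corresponding 1.0.1 imports (module paths taken from the file list above, nothing beyond the class names is assumed):

```python
# a minimal sketch of the 1.0.1 import paths after `pip install tgedr-dataops`;
# module locations come from the file list above
from tgedr_dataops.commons.s3_connector import S3Connector
from tgedr_dataops.sink.local_fs_file_sink import LocalFsFileSink
from tgedr_dataops.source.s3_file_source import S3FileSource

print(S3Connector, LocalFsFileSink, S3FileSource)
```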
@@ -0,0 +1,135 @@
+ [project]
+ name = "tgedr-dataops"
+ version = "1.0.1"
+ description = "data operations related code"
+ authors = [
+ {name = "joao tiago viegas",email = "3536754+jtviegas@users.noreply.github.com"}
+ ]
+ readme = "README.md"
+ requires-python = ">=3.11"
+
+ dependencies = [
+ "tgedr-dataops-abs==0.0.3",
+ "s3fs==2024.5.0",
+ "boto3==1.34.106",
+ "openpyxl==3.1.2",
+ "pyarrow>=23.0.0",
+ "moto>=5.1.20",
+ ]
+ [dependency-groups]
+ dev = [
+ "pre-commit~=4.2.0",
+ "pytest~=8.3.5",
+ "pytest-bdd~=8.1.0",
+ "pytest-cov~=4.1.0",
+ "pytest-mock~=3.15.0",
+ "ruff==0.9.10",
+ "bandit==1.8.3",
+ "safety==3.5.1",
+ "typer<0.17.0",
+ "genbadge[coverage]>=1.1.3",
+ ]
+
+ # [project.scripts]
+ # run = "tgedr.pycommons.entrypoint:entrypoint"
+
+ [build-system]
+ requires = ["setuptools>=78.1.0", "wheel>=0.45.1"]
+ build-backend = "setuptools.build_meta"
+
+ [tool.setuptools.packages.find]
+ where = ["src"]
+
+ [tool.setuptools]
+ include-package-data = true
+
+ [tool.setuptools.package-data]
+ "*" = ["CHANGELOG"]
+
+ [tool.coverage.paths]
+ source = ["src/"]
+
+ [tool.coverage.run]
+ source = ["src/"]
+ include = ["src/*"]
+ omit = [
+ "*/tests/*",
+ "*/test_*",
+ "*/__pycache__/*",
+ "*/migrations/*",
+ "*/venv/*",
+ "*/.venv/*"
+ ]
+
+ [tool.coverage.report]
+ exclude_lines = [
+ "pragma: no cover",
+ "def __repr__",
+ "raise AssertionError",
+ "raise NotImplementedError",
+ "if __name__ == .__main__.:",
+ "if TYPE_CHECKING:",
+ ]
+ show_missing = true
+ skip_covered = false
+ skip_empty = false
+
+ [tool.pytest.ini_options]
+ # bdd_features_base_dir = "documentation/features"
+ pythonpath = "."
+
+ [tool.ruff]
+ exclude = [
+ ".bzr",
+ ".direnv",
+ ".eggs",
+ ".git",
+ ".git-rewrite",
+ ".hg",
+ ".ipynb_checkpoints",
+ ".mypy_cache",
+ ".nox",
+ ".pants.d",
+ ".pyenv",
+ ".pytest_cache",
+ ".pytype",
+ ".ruff_cache",
+ ".svn",
+ ".tox",
+ ".venv",
+ ".vscode",
+ "__pypackages__",
+ "_build",
+ "buck-out",
+ "build",
+ "dist",
+ "node_modules",
+ "site-packages",
+ "venv",
+ "tests/",
+ "typings/"
+ ]
+
+ line-length = 120
+ indent-width = 4
+
+ [tool.ruff.lint]
+ select = ["ALL"]
+ ignore = ["D203", "S101", "D104", "INP001", "D213", "COM812", "I001",
+ "D401", "D407", "RET504", "PLR2004", "FA102", "E501", "EXE002", "PLR0913",
+ "PLR0912", "C901", "PLR0911", "D413", "N818", "B024", "ANN401", "SIM300",
+ "FBT001", "FBT002", "G004", "TRY003", "EM102", "EM101", "PD015", "PD901"]
+ fixable = ["ALL"]
+ unfixable = []
+ # Allow unused variables when underscore-prefixed.
+ dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
+
+ [tool.ruff.format]
+ # Like Black, use double quotes for strings.
+ quote-style = "double"
+ # Like Black, indent with spaces, rather than tabs.
+ indent-style = "space"
+ # Like Black, respect magic trailing commas.
+ skip-magic-trailing-comma = false
+ # Like Black, automatically detect the appropriate line ending.
+ line-ending = "auto"
@@ -1,17 +1,30 @@
+ """AWS S3 connector module.
+
+ This module provides the S3Connector class for establishing and managing
+ connections to AWS S3 resources.
+ """
  import os
  import boto3


  class S3Connector:
- """utility base class to be extended, providing a connection session with aws s3 resources"""
+ """utility base class to be extended, providing a connection session with aws s3 resources."""

- def __init__(self):
+ def __init__(self) -> None:
+ """Initialize the S3Connector with empty session, resource, and client attributes."""
  self.__resource = None
  self.__session = None
  self.__client = None

  @property
- def _session(self): # pragma: no cover
+ def _session(self) -> boto3.Session:
+ """Get or create a boto3 session.
+
+ Returns
+ -------
+ boto3.Session
+ The boto3 session instance, using credentials if configured.
+ """
  if self.__session is None:
  if "1" == os.getenv("S3_CONNECTOR_USE_CREDENTIALS", default="0"):
  self.__session = boto3.Session(
@@ -26,13 +39,27 @@ class S3Connector:
  return self.__session

  @property
- def _resource(self):
+ def _resource(self) -> boto3.resources.base.ServiceResource:
+ """Get or create an S3 resource.
+
+ Returns
+ -------
+ boto3.resources.base.ServiceResource
+ The boto3 S3 resource instance.
+ """
  if self.__resource is None:
  self.__resource = self._session.resource("s3")
  return self.__resource

  @property
- def _client(self):
+ def _client(self) -> boto3.client:
+ """Get or create an S3 client.
+
+ Returns
+ -------
+ boto3.client
+ The boto3 S3 client instance.
+ """
  if self.__client is None:
  self.__client = self._session.client("s3")
  return self.__client
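
As an illustration of how the connector above is meant to be extended, a minimal, hypothetical subclass sketch; only the lazily built `_client` property and the `S3_CONNECTOR_USE_CREDENTIALS` switch come from the diff, the listing helper itself is made up:

```python
# hypothetical subclass of S3Connector, not part of the package
import os

from tgedr_dataops.commons.s3_connector import S3Connector


class BucketLister(S3Connector):
    """Lists object keys in a bucket using the inherited boto3 client."""

    def list_keys(self, bucket: str, prefix: str = "") -> list[str]:
        # _client is created lazily from the session on first access
        response = self._client.list_objects_v2(Bucket=bucket, Prefix=prefix)
        return [entry["Key"] for entry in response.get("Contents", [])]


# leave the default ("0") to rely on the ambient AWS credential chain;
# set S3_CONNECTOR_USE_CREDENTIALS=1 to build the session from explicit credentials
os.environ.setdefault("S3_CONNECTOR_USE_CREDENTIALS", "0")
print(BucketLister().list_keys("my-bucket", prefix="raw/"))
```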
@@ -0,0 +1,187 @@
+ """Filesystem utility functions for file and path operations.
+
+ This module provides utilities for:
+ - Creating temporary files and directories
+ - Parsing and processing S3 URLs and paths
+ - Extracting URL protocols
+ - Generating file hashes
+ """
+ import tempfile
+ import re
+ import hashlib
+ from pathlib import Path
+
+
+ def temp_dir(root: str | None = None, suffix: str | None = None, prefix: str | None = None) -> str:
+ """Create a temporary directory and return its path.
+
+ Parameters
+ ----------
+ root : str | None
+ Directory where the temporary directory will be created.
+ suffix : str | None
+ Suffix for the temporary directory name.
+ prefix : str | None
+ Prefix for the temporary directory name.
+
+ Returns
+ -------
+ str
+ Path to the created temporary directory.
+ """
+ return tempfile.mkdtemp(suffix=suffix, prefix=prefix, dir=root)
+
+
+ def temp_file(
+ root: str | None = None, suffix: str | None = None, prefix: str | None = None,
+ discard_handle: bool = True
+ ) -> str | tuple[int, str]:
+ """Create a temporary file and return its path or handle and path.
+
+ Parameters
+ ----------
+ root : str | None
+ Directory where the temporary file will be created.
+ suffix : str | None
+ Suffix for the temporary file name.
+ prefix : str | None
+ Prefix for the temporary file name.
+ discard_handle : bool
+ If True, return only the file path. If False, return tuple of (handle, path).
+
+ Returns
+ -------
+ str | tuple[int, str]
+ File path if discard_handle is True, otherwise tuple of (file handle, file path).
+ """
+ h, f = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=root)
+ if discard_handle:
+ return f
+ return (h, f)
+
+
+ def resolve_url_protocol(url: str) -> str:
+ """Extract the protocol from a URL.
+
+ Parameters
+ ----------
+ url : str
+ The URL to extract the protocol from.
+
+ Returns
+ -------
+ str
+ The protocol (e.g., 'http://', 'https://') or None if no protocol is found.
+ """
+ result = None
+ group_match = re.search("(.*://).*", url)
+ if group_match is not None:
+ result = group_match.group(1)
+ return result
+
+
+ def resolve_s3_protocol(url: str) -> str:
+ """Extract the S3 protocol from a URL.
+
+ Parameters
+ ----------
+ url : str
+ The URL to extract the S3 protocol from.
+
+ Returns
+ -------
+ str
+ The S3 protocol (e.g., 's3://', 's3a://') or None if no S3 protocol is found.
+ """
+ result = None
+ group_match = re.search("(s3[a]?://).*", url)
+ if group_match is not None:
+ result = group_match.group(1)
+ return result
+
+
+ def remove_s3_protocol(url: str) -> str:
+ """Remove the S3 protocol prefix from a URL.
+
+ Parameters
+ ----------
+ url : str
+ The S3 URL to remove the protocol from.
+
+ Returns
+ -------
+ str
+ The URL without the S3 protocol prefix (s3:// or s3a://).
+ """
+ if url.startswith("s3://"):
+ result = url[5:]
+ elif url.startswith("s3a://"):
+ result = url[6:]
+ else:
+ result = url
+ return result
+
+
+ def process_s3_path(path: str) -> tuple[str, str]:
+ """Extract bucket and key from an S3 path.
+
+ Parameters
+ ----------
+ path : str
+ The S3 path to process (with or without protocol).
+
+ Returns
+ -------
+ tuple[str, str]
+ A tuple containing (bucket, key).
+ """
+ no_protocol_path = remove_s3_protocol(path)
+ path_elements = no_protocol_path.split("/")
+ bucket = path_elements[0]
+ key = "/".join(path_elements[1:])
+ return (bucket, key)
+
+
+ def process_s3_url(url: str) -> tuple[str, str, str]:
+ """Extract protocol, bucket, and key from an S3 URL.
+
+ Parameters
+ ----------
+ url : str
+ The S3 URL to process.
+
+ Returns
+ -------
+ tuple[str, str, str]
+ A tuple containing (protocol, bucket, key).
+ """
+ protocol = resolve_s3_protocol(url)
+ no_protocol_url = remove_s3_protocol(url)
+ path_elements = no_protocol_url.split("/")
+ bucket = path_elements[0]
+ key = "/".join(path_elements[1:])
+ return ("" if protocol is None else protocol, bucket, key)
+
+
+ def hash_file(filepath: str, hash_func=hashlib.sha256) -> str: # noqa: ANN001
+ """Generate a hash for a file.
+
+ Args:
+ filepath (str): The path to the file.
+ hash_func: A hashlib hash function, e.g., hashlib.md5().
+
+ Returns:
+ str: The hexadecimal hash string of the file.
+ """
+ # Initialize the hash object
+ hasher = hash_func()
+
+ # Open the file in binary read mode
+ with Path(filepath).open("rb") as file:
+ # Read the file in chunks to avoid using too much memory
+ chunk_size = 8192
+ while chunk := file.read(chunk_size):
+ hasher.update(chunk)
+
+ # Return the hexadecimal digest of the hash
+ return hasher.hexdigest()
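
A short sketch exercising the helpers above; the bucket and key values are invented, the calls follow the signatures shown in the diff:

```python
# a minimal sketch using the utils_fs helpers added in 1.0.1
from tgedr_dataops.commons.utils_fs import hash_file, process_s3_url, resolve_s3_protocol, temp_file

protocol, bucket, key = process_s3_url("s3a://my-bucket/raw/data.parquet")
assert (protocol, bucket, key) == ("s3a://", "my-bucket", "raw/data.parquet")
assert resolve_s3_protocol("/local/path") is None  # no s3 scheme -> None

# mkstemp-backed helper returns just the path when discard_handle is True (the default)
path = temp_file(suffix=".txt")
print(hash_file(path))  # sha256 hex digest of the (empty) file
```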
@@ -0,0 +1,21 @@
+ """Pandas DataFrame validation implementation module.
+
+ This module provides the Pandas-specific implementation of Great Expectations validation.
+ """
+ from great_expectations.execution_engine import ExecutionEngine
+ from great_expectations.execution_engine import PandasExecutionEngine
+ from tgedr_dataops_abs.great_expectations_validation import GreatExpectationsValidation
+
+
+ class PandasValidation(GreatExpectationsValidation):
+ """Pandas DataFrame validation implementation."""
+
+ def _get_execution_engine(self, batch_data_dict: dict) -> ExecutionEngine:
+ """Get the execution engine used by the validation implementation.
+
+ Returns
+ -------
+ ExecutionEngine
+ The execution engine instance.
+ """
+ return PandasExecutionEngine(batch_data_dict=batch_data_dict)
@@ -0,0 +1,77 @@
+ """Local filesystem sink implementation for persisting files.
+
+ This module provides:
+ - LocalFsFileSink: sink class for saving/persisting files to local filesystem.
+ """
+
+ import logging
+ import shutil
+ from pathlib import Path
+ from typing import Any
+
+ from tgedr_dataops_abs.sink import Sink, SinkException
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class LocalFsFileSink(Sink):
+ """sink class used to save/persist an object/file to a local fs location."""
+
+ CONTEXT_SOURCE_PATH = "source"
+ CONTEXT_TARGET_PATH = "target"
+
+ def put(self, context: dict[str, Any] | None = None) -> Any:
+ """Copy a file from source to target on local filesystem.
+
+ Parameters
+ ----------
+ context : dict[str, Any], optional
+ Context dictionary containing 'source' and 'target' paths.
+
+ Raises
+ ------
+ SinkException
+ If source or target context is missing.
+ """
+ logger.info(f"[put|in] ({context})")
+
+ if self.CONTEXT_SOURCE_PATH not in context:
+ raise SinkException(f"you must provide context for {self.CONTEXT_SOURCE_PATH}")
+ if self.CONTEXT_TARGET_PATH not in context:
+ raise SinkException(f"you must provide context for {self.CONTEXT_TARGET_PATH}")
+
+ source = context[self.CONTEXT_SOURCE_PATH]
+ target = context[self.CONTEXT_TARGET_PATH]
+
+ shutil.copy(source, target)
+ logger.info("[put|out]")
+
+ def delete(self, context: dict[str, Any] | None = None) -> None:
+ """Delete a file or directory from local filesystem.
+
+ Parameters
+ ----------
+ context : dict[str, Any], optional
+ Context dictionary containing 'target' path.
+
+ Raises
+ ------
+ SinkException
+ If target context is missing or target is neither file nor directory.
+ """
+ logger.info(f"[delete|in] ({context})")
+
+ if self.CONTEXT_TARGET_PATH not in context:
+ raise SinkException(f"you must provide context for {self.CONTEXT_TARGET_PATH}")
+
+ target = Path(context[self.CONTEXT_TARGET_PATH])
+
+ if target.is_file():
+ target.unlink()
+ elif target.is_dir():
+ shutil.rmtree(target)
+ else:
+ raise SinkException(f"[delete] is it a dir or a folder? {target}")
+
+ logger.info("[delete|out]")
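
Finally, a minimal sketch of driving the sink above with the context keys it defines; the file names are made up, and it is assumed the `Sink` base class from `tgedr-dataops-abs` needs no constructor arguments:

```python
# a minimal sketch, assuming LocalFsFileSink() can be constructed with no args
from pathlib import Path

from tgedr_dataops.sink.local_fs_file_sink import LocalFsFileSink

Path("report.csv").write_text("a,b\n1,2\n")

sink = LocalFsFileSink()
# copy the file to a new location on the local filesystem
sink.put(context={
    LocalFsFileSink.CONTEXT_SOURCE_PATH: "report.csv",
    LocalFsFileSink.CONTEXT_TARGET_PATH: "report_copy.csv",
})
# remove the copy again (files are unlinked, directories removed recursively)
sink.delete(context={LocalFsFileSink.CONTEXT_TARGET_PATH: "report_copy.csv"})
```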