tgedr-dataops 0.0.37__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. {tgedr/dataops → tgedr_dataops}/commons/s3_connector.py +32 -5
  2. tgedr_dataops/commons/utils_fs.py +187 -0
  3. tgedr_dataops/quality/pandas_validation.py +21 -0
  4. tgedr_dataops/sink/local_fs_file_sink.py +77 -0
  5. {tgedr/dataops → tgedr_dataops}/sink/s3_file_sink.py +47 -11
  6. tgedr_dataops/source/abstract_s3_file_source.py +72 -0
  7. tgedr_dataops/source/local_fs_file_source.py +108 -0
  8. tgedr_dataops/source/pd_df_s3_source.py +130 -0
  9. {tgedr/dataops → tgedr_dataops}/source/s3_file_copy.py +64 -28
  10. tgedr_dataops/source/s3_file_extended_source.py +68 -0
  11. {tgedr/dataops → tgedr_dataops}/source/s3_file_source.py +63 -27
  12. tgedr_dataops/store/fs_single_partition_parquet.py +331 -0
  13. tgedr_dataops/store/local_fs_single_partition_parquet.py +56 -0
  14. tgedr_dataops/store/s3_single_partition_parquet.py +193 -0
  15. tgedr_dataops-1.0.2.dist-info/METADATA +72 -0
  16. tgedr_dataops-1.0.2.dist-info/RECORD +22 -0
  17. {tgedr_dataops-0.0.37.dist-info → tgedr_dataops-1.0.2.dist-info}/WHEEL +1 -1
  18. tgedr_dataops-1.0.2.dist-info/top_level.txt +1 -0
  19. tgedr/dataops/chain.py +0 -51
  20. tgedr/dataops/commons/dataset.py +0 -23
  21. tgedr/dataops/commons/metadata.py +0 -172
  22. tgedr/dataops/commons/utils_fs.py +0 -85
  23. tgedr/dataops/commons/utils_spark.py +0 -87
  24. tgedr/dataops/etl.py +0 -112
  25. tgedr/dataops/processor.py +0 -27
  26. tgedr/dataops/sink/local_fs_file_sink.py +0 -47
  27. tgedr/dataops/sink/sink.py +0 -46
  28. tgedr/dataops/source/abstract_s3_file_source.py +0 -43
  29. tgedr/dataops/source/delta_table_source.py +0 -49
  30. tgedr/dataops/source/local_delta_table.py +0 -47
  31. tgedr/dataops/source/local_fs_file_source.py +0 -71
  32. tgedr/dataops/source/pd_df_s3_source.py +0 -76
  33. tgedr/dataops/source/s3_delta_table.py +0 -75
  34. tgedr/dataops/source/s3_file_extended_source.py +0 -39
  35. tgedr/dataops/source/source.py +0 -51
  36. tgedr/dataops/store/fs_single_partition_parquet.py +0 -231
  37. tgedr/dataops/store/local_fs_single_partition_parquet.py +0 -24
  38. tgedr/dataops/store/s3_single_partition_parquet.py +0 -102
  39. tgedr/dataops/store/spark_delta.py +0 -369
  40. tgedr/dataops/store/store.py +0 -49
  41. tgedr/dataops/utils_reflection.py +0 -134
  42. tgedr/dataops/validation/abs.py +0 -46
  43. tgedr/dataops/validation/pandas.py +0 -10
  44. tgedr/dataops/validation/pyspark.py +0 -10
  45. tgedr_dataops-0.0.37.dist-info/METADATA +0 -21
  46. tgedr_dataops-0.0.37.dist-info/RECORD +0 -38
  47. tgedr_dataops-0.0.37.dist-info/top_level.txt +0 -1
  48. {tgedr/dataops → tgedr_dataops}/__init__.py +0 -0
  49. {tgedr/dataops → tgedr_dataops}/sink/__init__.py +0 -0
  50. {tgedr/dataops → tgedr_dataops}/source/__init__.py +0 -0
  51. {tgedr_dataops-0.0.37.dist-info → tgedr_dataops-1.0.2.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,130 @@
+ """Module for reading pandas DataFrames from S3 sources.
+
+ This module provides:
+ - PdDfS3Source: class for reading CSV and Excel files from S3 into pandas DataFrames.
+ """
+ from io import StringIO
+ import logging
+ from typing import Any, ClassVar
+ import pandas as pd
+
+ from tgedr_dataops.commons.utils_fs import process_s3_url
+ from tgedr_dataops.source.abstract_s3_file_source import AbstractS3FileSource
+ from tgedr_dataops_abs.source import SourceException
+
+ logger = logging.getLogger()
+
+
+ class PdDfS3Source(AbstractS3FileSource):
+     """class used to read a pandas dataframe from a csv file in s3."""
+
+     CONTEXT_KEY_FILE_FORMAT = "file_format"
+     CONTEXT_KEY_SEPARATOR = "sep"
+     CONTEXT_KEY_NO_HEADER = "no_header"
+     CONTEXT_KEY_COLUMN_NAMES = "column_names"
+     CONTEXT_KEY_SCHEMA_TYPES = "schema_types"
+     DEFAULT_FORMAT = "csv"
+     FORMATS: ClassVar[list[str]] = ["csv", "xlsx"]
+     DEFAULT_SEPARATOR = ","
+
+     def __init__(self, config: dict[str, Any] | None = None) -> None:
+         """Initialize the PdDfS3Source with optional configuration.
+
+         Parameters
+         ----------
+         config : dict[str, Any], optional
+             Configuration dictionary for S3 connection, by default None.
+         """
+         super().__init__(config=config)
+
+     def get(self, context: dict[str, Any] | None = None) -> pd.DataFrame:
+         """Retrieve and load a pandas DataFrame from S3.
+
+         Supports CSV and Excel formats.
+
+         Parameters
+         ----------
+         context : dict[str, Any], optional
+             Context dictionary containing:
+             - 'source' or CONTEXT_KEY_URL: S3 URL
+             - 'file_format': File format (csv or xlsx), defaults to csv
+             - 'sep': CSV separator, defaults to comma
+             - 'no_header': If present, CSV has no header row
+             - 'column_names': List of column names for CSV
+             - 'schema_types': Dictionary of column types for CSV
+
+         Returns
+         -------
+         pd.DataFrame
+             The loaded pandas DataFrame.
+
+         Raises
+         ------
+         SourceException
+             If source URL is missing or if file_format is unsupported.
+         """
+         logger.info(f"[get|in] ({context})")
+         result: pd.DataFrame = None
+
+         if self.CONTEXT_KEY_URL not in context:
+             raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
+
+         _format: str = self.DEFAULT_FORMAT
+         if self.CONTEXT_KEY_FILE_FORMAT in context:
+             _format = context[self.CONTEXT_KEY_FILE_FORMAT]
+             if _format not in self.FORMATS:
+                 raise SourceException(f"[get] invalid format: {_format}")
+
+         result = self.__read_csv(context=context) if "csv" == _format else self.__read_excel(context=context)
+
+         logger.info(f"[get|out] => {result}")
+         return result
+
+     def __read_csv(self, context: dict[str, Any] | None = None) -> pd.DataFrame:
+         """Read a CSV file from S3 into a pandas DataFrame.
+
+         Parameters
+         ----------
+         context : dict[str, Any], optional
+             Context dictionary with S3 URL and CSV reading options.
+
+         Returns
+         -------
+         pd.DataFrame
+             The loaded DataFrame from CSV.
+         """
+         logger.info(f"[__read_csv|in] ({context})")
+
+         _, bucket, key = process_s3_url(context[self.CONTEXT_KEY_URL])
+
+         obj = self._client.get_object(Bucket=bucket, Key=key)
+         data = obj["Body"].read().decode("utf-8")
+
+         header = 0 if self.CONTEXT_KEY_NO_HEADER not in context else None
+         names = context.get(self.CONTEXT_KEY_COLUMN_NAMES, None)
+         dtype = context.get(self.CONTEXT_KEY_SCHEMA_TYPES, None)
+         sep = context.get(self.CONTEXT_KEY_SEPARATOR, self.DEFAULT_SEPARATOR)
+
+         result: pd.DataFrame = pd.read_csv(StringIO(data), sep=sep, header=header, names=names, dtype=dtype)
+
+         logger.info(f"[__read_csv|out] => {result}")
+         return result
+
+     def __read_excel(self, context: dict[str, Any] | None = None) -> pd.DataFrame:
+         """Read an Excel file from S3 into a pandas DataFrame.
+
+         Parameters
+         ----------
+         context : dict[str, Any], optional
+             Context dictionary containing S3 URL.
+
+         Returns
+         -------
+         pd.DataFrame
+             The loaded DataFrame from Excel.
+         """
+         logger.info(f"[__read_excel|in] ({context})")
+         src = context[self.CONTEXT_KEY_URL]
+         result: pd.DataFrame = pd.read_excel(src, engine="openpyxl")
+         logger.info(f"[__read_excel|out] => {result}")
+         return result
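To illustrate the new source, a minimal, hypothetical usage sketch follows; the bucket/key URL is invented, and CONTEXT_KEY_URL is inherited from AbstractS3FileSource, which this diff does not show:

from tgedr_dataops.source.pd_df_s3_source import PdDfS3Source

# hypothetical usage: read a semicolon-separated CSV from an assumed S3 URL
source = PdDfS3Source()
df = source.get(
    context={
        PdDfS3Source.CONTEXT_KEY_URL: "s3://my-bucket/data/input.csv",  # assumed example URL
        PdDfS3Source.CONTEXT_KEY_FILE_FORMAT: "csv",
        PdDfS3Source.CONTEXT_KEY_SEPARATOR: ";",
    }
)
print(df.head())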
@@ -1,17 +1,22 @@
+ """Module for copying files between S3 buckets.
+
+ This module provides:
+ - S3FileCopy: class for copying objects/files from one S3 bucket to another
+ """
  import logging
- import os
- from typing import Any, Dict, List, Optional
- from tgedr.dataops.commons.s3_connector import S3Connector
+ from pathlib import Path
+ from typing import Any
+ from tgedr_dataops.commons.s3_connector import S3Connector

- from tgedr.dataops.source.source import Source, SourceException
- from tgedr.dataops.commons.utils_fs import process_s3_path, resolve_s3_protocol
+ from tgedr_dataops_abs.source import Source, SourceException
+ from tgedr_dataops.commons.utils_fs import process_s3_path, resolve_s3_protocol


  logger = logging.getLogger(__name__)


  class S3FileCopy(Source, S3Connector):
-     """class used to copy objects/files from an s3 bucket to another s3 bucket"""
+     """class used to copy objects/files from an s3 bucket to another s3 bucket."""

      CONTEXT_KEY_SOURCE = "source"
      CONTEXT_KEY_TARGET = "target"
@@ -19,14 +24,38 @@ class S3FileCopy(Source, S3Connector):
      CONTEXT_KEY_SUFFIX = "suffix"
      CONTEXT_KEY_PRESERVE_SOURCE_KEY = "preserve_source_key"

-     def __init__(self, config: Optional[Dict[str, Any]] = None):
+     def __init__(self, config: dict[str, Any] | None = None) -> None:
+         """Initialize the S3FileCopy source.
+
+         Parameters
+         ----------
+         config : dict[str, Any], optional
+             Configuration dictionary containing optional AWS credentials.
+         """
          Source.__init__(self, config=config)
          S3Connector.__init__(self)

-     def list(self, context: Optional[Dict[str, Any]] = None) -> List[str]:
+     def list(self, context: dict[str, Any] | None = None) -> list[str]:
+         """List objects in an S3 bucket matching the given prefix.
+
+         Parameters
+         ----------
+         context : dict[str, Any], optional
+             Context dictionary containing 'source' (source S3 URL) and optionally 'suffix' to filter results.
+
+         Returns
+         -------
+         list[str]
+             List of S3 URLs matching the criteria.
+
+         Raises
+         ------
+         SourceException
+             If the 'source' key is missing from the context.
+         """
          logger.info(f"[list|in] ({context})")

-         result: List[str] = []
+         result: list[str] = []
          if self.CONTEXT_KEY_SOURCE not in context:
              raise SourceException(f"you must provide context for {self.CONTEXT_KEY_SOURCE}")

@@ -48,10 +77,27 @@ class S3FileCopy(Source, S3Connector):
          logger.info(f"[list|out] => result len: {len(result)}")
          return result

-     def get(self, context: Optional[Dict[str, Any]] = None) -> Any:
+     def get(self, context: dict[str, Any] | None = None) -> Any:
+         """Copy a file from one S3 location to another.
+
+         Parameters
+         ----------
+         context : dict[str, Any], optional
+             Context dictionary containing 'source' (source S3 URL) and 'target' (target S3 URL).
+
+         Returns
+         -------
+         Any
+             The target S3 URL.
+
+         Raises
+         ------
+         SourceException
+             If source or target context is missing.
+         """
          logger.info(f"[get|in] ({context})")

-         result: List[str] = []
+         result: list[str] = []
          if self.CONTEXT_KEY_FILES not in context:
              raise SourceException(f"you must provide context for {self.CONTEXT_KEY_FILES}")
          if self.CONTEXT_KEY_TARGET not in context:
@@ -59,9 +105,7 @@ class S3FileCopy(Source, S3Connector):

          preserve_source_key = False
          if self.CONTEXT_KEY_PRESERVE_SOURCE_KEY in context:
-             preserve_source_key = (
-                 True if (str(context[self.CONTEXT_KEY_PRESERVE_SOURCE_KEY]).lower() in ["1", "true"]) else False
-             )
+             preserve_source_key = str(context[self.CONTEXT_KEY_PRESERVE_SOURCE_KEY]).lower() in ["1", "true"]
          logger.info(f"[get] preserve_source_key: {preserve_source_key}")

          target_bucket, target_key = process_s3_path(context[self.CONTEXT_KEY_TARGET])
@@ -69,31 +113,23 @@
          protocol = "" if s3_protocol is None else s3_protocol

          files = context[self.CONTEXT_KEY_FILES]
-         multiple_files: bool = True if (0 < len(files)) else False
-
          for file in files:
              src_bucket, src_key = process_s3_path(file)
-
              src = {"Bucket": src_bucket, "Key": src_key}

-             if True == preserve_source_key:
-                 key = os.path.join(target_key, src_key)
+             target_key = target_key.rstrip("/") if target_key.endswith("/") else target_key
+             if preserve_source_key:
+                 key = str(Path(target_key) / src_key)
              else:
-                 key = target_key
-                 if not multiple_files:
-                     if target_key.endswith("/"):
-                         src_key_leaf = os.path.basename(src_key)
-                         key += src_key_leaf
-                     else:
-                         src_key_leaf = os.path.basename(src_key)
-                         key = os.path.join(key, src_key_leaf)
+                 src_key_leaf = Path(src_key).name
+                 key = str(Path(target_key) / src_key_leaf)

              logger.info(
                  f"[get] copying... src bucket: {src_bucket} src key: {src_key} target bucket: {target_bucket} target key: {key}"
              )
              self._client.copy(src, target_bucket, key)

-             result.append(os.path.join(protocol, target_bucket, key))
+             result.append(str(Path(protocol) / target_bucket / key))

          logger.info(f"[get|out] => {result}")
          return result
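A hedged usage sketch of S3FileCopy as changed above; bucket names are invented, and it assumes list() returns s3:// URLs that get() can consume via the 'files' context key (the body of list() is mostly elided in this hunk):

from tgedr_dataops.source.s3_file_copy import S3FileCopy

copier = S3FileCopy()
# list source objects (assumed bucket/prefix), optionally filtered by suffix
files = copier.list(
    context={
        S3FileCopy.CONTEXT_KEY_SOURCE: "s3://src-bucket/raw/",
        S3FileCopy.CONTEXT_KEY_SUFFIX: ".csv",
    }
)
# copy them under the target prefix, keeping the source keys ("1" or "true" both enable it)
copied = copier.get(
    context={
        S3FileCopy.CONTEXT_KEY_FILES: files,
        S3FileCopy.CONTEXT_KEY_TARGET: "s3://dst-bucket/archive/",
        S3FileCopy.CONTEXT_KEY_PRESERVE_SOURCE_KEY: "true",
    }
)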
@@ -0,0 +1,68 @@
+ """S3 file extended source module for retrieving objects from S3 with metadata support.
+
+ This module provides:
+ - S3FileExtendedSource: class for retrieving S3 objects/files with metadata extraction
+ """
+ import logging
+ from typing import Any, ClassVar
+
+ from tgedr_dataops.source.s3_file_source import S3FileSource
+ from tgedr_dataops_abs.source import SourceException
+ from tgedr_dataops.commons.utils_fs import process_s3_path
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class S3FileExtendedSource(S3FileSource):
+     """Class used to retrieve objects/files from S3 bucket to local filesystem."""
+
+     METADATA_KEYS: ClassVar[list[str]] = ["LastModified", "ContentLength", "ETag", "VersionId", "ContentType"]
+
+     def __init__(self, config: dict[str, Any] | None = None) -> None:
+         """Initialize the S3FileExtendedSource.
+
+         Parameters
+         ----------
+         config : dict[str, Any], optional
+             Configuration dictionary containing optional AWS credentials.
+         """
+         super().__init__(config=config)
+
+     def get_metadata(self, context: dict[str, Any] | None = None) -> dict[str, Any]:
+         """Retrieve metadata for an S3 object.
+
+         Parameters
+         ----------
+         context : dict[str, Any], optional
+             Context dictionary containing 'source' S3 URL.
+
+         Returns
+         -------
+         dict[str, Any]
+             Dictionary containing object metadata including size, last modified time, etc.
+
+         Raises
+         ------
+         SourceException
+             If source context is missing.
+         """
+         logger.info(f"[get_metadata|in] ({context})")
+
+         result: dict[str, Any] = {}
+         if self.CONTEXT_KEY_SOURCE not in context:
+             raise SourceException(f"you must provide context for {self.CONTEXT_KEY_SOURCE}")
+
+         bucket, key = process_s3_path(context[self.CONTEXT_KEY_SOURCE])
+
+         o = self._client.head_object(Bucket=bucket, Key=key)
+
+         for key in list(o.keys()):
+             if key in self.METADATA_KEYS:
+                 if key == "LastModified":
+                     result[key] = int(o[key].timestamp())
+                 else:
+                     result[key] = o[key]
+
+         logger.info(f"[get_metadata|out] => result len: {len(result)}")
+         return result
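A minimal sketch of the metadata call added above; the URL is illustrative, and the example output keys follow METADATA_KEYS (LastModified is converted to a Unix timestamp):

from tgedr_dataops.source.s3_file_extended_source import S3FileExtendedSource

src = S3FileExtendedSource()
meta = src.get_metadata(
    context={S3FileExtendedSource.CONTEXT_KEY_SOURCE: "s3://my-bucket/data/file.parquet"}  # assumed URL
)
# e.g. {"LastModified": 1718000000, "ContentLength": 1234, "ETag": '"abc..."', "ContentType": "binary/octet-stream"}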
@@ -1,31 +1,60 @@
+ """S3 file source module for retrieving objects from S3 buckets.
+
+ This module provides:
+ - S3FileSource: A source implementation for listing and downloading files from S3.
+ """
  import logging
- import os
- from typing import Any, Dict, List, Optional
- from tgedr.dataops.commons.s3_connector import S3Connector
+ from pathlib import Path
+ from typing import Any
+ from tgedr_dataops.commons.s3_connector import S3Connector

- from tgedr.dataops.source.source import Source, SourceException
- from tgedr.dataops.commons.utils_fs import remove_s3_protocol, resolve_s3_protocol
+ from tgedr_dataops_abs.source import Source, SourceException
+ from tgedr_dataops.commons.utils_fs import remove_s3_protocol, resolve_s3_protocol


  logger = logging.getLogger(__name__)


  class S3FileSource(Source, S3Connector):
-     """class used to retrieve objects/files from s3 bucket to local fs location"""
+     """class used to retrieve objects/files from s3 bucket to local fs location."""

      CONTEXT_KEY_SOURCE = "source"
      CONTEXT_KEY_TARGET = "target"
      CONTEXT_KEY_FILES = "files"
      CONTEXT_KEY_SUFFIX = "suffix"

-     def __init__(self, config: Optional[Dict[str, Any]] = None):
+     def __init__(self, config: dict[str, Any] | None = None) -> None:
+         """Initialize the S3FileSource.
+
+         Parameters
+         ----------
+         config : dict[str, Any], optional
+             Configuration dictionary containing optional AWS credentials.
+         """
          Source.__init__(self, config=config)
          S3Connector.__init__(self)

-     def list(self, context: Optional[Dict[str, Any]] = None) -> List[str]:
+     def list(self, context: dict[str, Any] | None = None) -> list[str]:
+         """List files in the S3 bucket.
+
+         Parameters
+         ----------
+         context : dict[str, Any], optional
+             Context dictionary containing 'source' (S3 URL) and optionally 'suffix' for filtering.
+
+         Returns
+         -------
+         list[str]
+             List of S3 object URLs in the source bucket/prefix.
+
+         Raises
+         ------
+         SourceException
+             If source context is missing.
+         """
          logger.info(f"[list|in] ({context})")

-         result: List[str] = []
+         result: list[str] = []
          if self.CONTEXT_KEY_SOURCE not in context:
              raise SourceException(f"you must provide context for {self.CONTEXT_KEY_SOURCE}")

@@ -50,42 +79,49 @@ class S3FileSource(Source, S3Connector):
          logger.info(f"[list|out] => result len: {len(result)}")
          return result

-     def get(self, context: Optional[Dict[str, Any]] = None) -> Any:
+     def get(self, context: dict[str, Any] | None = None) -> Any:
+         """Download a file from S3 to local filesystem.
+
+         Parameters
+         ----------
+         context : dict[str, Any], optional
+             Context dictionary containing 'source' (S3 URL) and 'target' (local path).
+
+         Returns
+         -------
+         Any
+             The local target file path.
+
+         Raises
+         ------
+         SourceException
+             If source or target context is missing.
+         """
          logger.info(f"[get|in] ({context})")

-         result: List[str] = []
+         result: list[str] = []
          if self.CONTEXT_KEY_FILES not in context:
              raise SourceException(f"you must provide context for {self.CONTEXT_KEY_FILES}")
          if self.CONTEXT_KEY_TARGET not in context:
              raise SourceException(f"you must provide context for {self.CONTEXT_KEY_TARGET}")

-         preserve_structure = True
-
          files = context[self.CONTEXT_KEY_FILES]
          target = context[self.CONTEXT_KEY_TARGET]

-         target_is_dir: bool = False
-         if os.path.isdir(target):
-             target_is_dir = True
+         target_is_dir: bool = Path(target).is_dir()
+         target = target.rstrip("/") if target.endswith("/") else target

          for file in files:
              path_elements = remove_s3_protocol(file).split("/")
              bucket = path_elements[0]
              key = "/".join(path_elements[1:])
              filename = path_elements[-1]
-
-             if target_is_dir:
-                 if preserve_structure:
-                     local_file = os.path.join(target, key)
-                 else:
-                     local_file = os.path.join(target, filename)
-             else:
-                 local_file = target
+             local_file = str(Path(target) / filename) if target_is_dir else target

              # assure we have that path there
-             local_folder = os.path.dirname(local_file)
-             if not os.path.isdir(local_folder):
-                 os.makedirs(local_folder)
+             local_folder = Path(local_file).parent
+             if not local_folder.is_dir():
+                 local_folder.mkdir(parents=True)

              logger.info(f"[get] bucket: {bucket} key: {key} file: {file} local_file: {local_file}")
              self._client.download_file(Bucket=bucket, Key=key, Filename=local_file)
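Finally, a hypothetical download sketch for the reworked S3FileSource.get; paths are invented, and note that the new code places each file directly under the target directory as <target>/<filename> (the old preserve_structure behavior was removed):

from tgedr_dataops.source.s3_file_source import S3FileSource

src = S3FileSource()
keys = src.list(context={S3FileSource.CONTEXT_KEY_SOURCE: "s3://my-bucket/exports/"})  # assumed bucket
src.get(
    context={
        S3FileSource.CONTEXT_KEY_FILES: keys,
        S3FileSource.CONTEXT_KEY_TARGET: "/tmp/exports",  # must be an existing directory for files to fan out by name
    }
)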