tgedr-dataops 0.0.32__tar.gz → 0.0.33__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. {tgedr_dataops-0.0.32/src/tgedr_dataops.egg-info → tgedr_dataops-0.0.33}/PKG-INFO +1 -1
  2. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/README.md +3 -2
  3. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/setup.py +1 -1
  4. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/commons/utils_fs.py +9 -0
  5. tgedr_dataops-0.0.33/src/tgedr/dataops/source/abstract_s3_file_source.py +43 -0
  6. tgedr_dataops-0.0.33/src/tgedr/dataops/source/pd_df_s3_source.py +51 -0
  7. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33/src/tgedr_dataops.egg-info}/PKG-INFO +1 -1
  8. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr_dataops.egg-info/SOURCES.txt +2 -0
  9. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/LICENSE +0 -0
  10. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/pyproject.toml +0 -0
  11. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/setup.cfg +0 -0
  12. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/__init__.py +0 -0
  13. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/chain.py +0 -0
  14. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/commons/dataset.py +0 -0
  15. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/commons/metadata.py +0 -0
  16. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/commons/s3_connector.py +0 -0
  17. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/commons/utils_spark.py +0 -0
  18. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/etl.py +0 -0
  19. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/processor.py +0 -0
  20. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/sink/__init__.py +0 -0
  21. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/sink/local_fs_file_sink.py +0 -0
  22. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/sink/s3_file_sink.py +0 -0
  23. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/sink/sink.py +0 -0
  24. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/source/__init__.py +0 -0
  25. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/source/delta_table_source.py +0 -0
  26. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/source/local_delta_table.py +0 -0
  27. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/source/local_fs_file_source.py +0 -0
  28. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/source/s3_delta_table.py +0 -0
  29. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/source/s3_file_copy.py +0 -0
  30. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/source/s3_file_source.py +0 -0
  31. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/source/source.py +0 -0
  32. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/store/fs_single_partition_parquet.py +0 -0
  33. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/store/local_fs_single_partition_parquet.py +0 -0
  34. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/store/s3_single_partition_parquet.py +0 -0
  35. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/store/spark_delta.py +0 -0
  36. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/store/store.py +0 -0
  37. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/utils_reflection.py +0 -0
  38. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/validation/abs.py +0 -0
  39. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/validation/pandas.py +0 -0
  40. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr_dataops.egg-info/dependency_links.txt +0 -0
  41. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr_dataops.egg-info/requires.txt +0 -0
  42. {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr_dataops.egg-info/top_level.txt +0 -0
{tgedr_dataops-0.0.32/src/tgedr_dataops.egg-info → tgedr_dataops-0.0.33}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tgedr-dataops
-Version: 0.0.32
+Version: 0.0.33
 Summary: data operations related code
 Home-page: https://github.com/jtviegas-sandbox/dataops
 Author: joao tiago viegas
{tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/README.md
@@ -29,8 +29,9 @@ data operations related code
 - __LocalFsFileSource__: __source__ class used to retrieve local objects/files to another local fs location ([example](test/tgedr/dataops/source/test_localfs_file_source.py))
 - __S3FileSource__: __source__ class used to retrieve objects/files from s3 bucket to local fs location ([example](test/tgedr/dataops/source/test_s3_file_source.py))
 - __S3FileCopy__: __source__ class used to copy objects/files from an s3 bucket to another s3 bucket ([example](test/tgedr/dataops/source/test_s3_copy.py))
-- __S3DeltaTable__: __source__ class used to read delta lake format datasets from s3 bucket with python only, pyspark not needed, returning a pandas dataframe
-- __LocalDeltaTable__: __source__ class used to read delta lake format datasets from local fs with python only, pyspark not needed, returning a pandas dataframe
+- __S3DeltaTable__: __source__ class used to read delta lake format datasets from s3 bucket with python only, pyspark not needed, returning a pandas dataframe ([example](test/tgedr/dataops/source/test_s3_delta_table.py))
+- __LocalDeltaTable__: __source__ class used to read delta lake format datasets from local fs with python only, pyspark not needed, returning a pandas dataframe ([example](test/tgedr/dataops/source/test_local_delta_table.py))
+- __PdDfS3Source__: __source__ class used to read a pandas dataframe from, by default, a csv file in s3 ([example](test/tgedr/dataops/source/test_pd_df_s3_source.pypd))
 
 #### store
 - __Store__ : abstract class used to manage persistence, defining CRUD-like (CreateReadUpdateDelete) methods
{tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/setup.py
@@ -4,7 +4,7 @@ import os
 from setuptools import setup, find_namespace_packages
 
 logger = logging.getLogger(__name__)
-VERSION = "0.0.32"
+VERSION = "0.0.33"
 logging.info(f"building version: {VERSION}")
 
 setup(
{tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr/dataops/commons/utils_fs.py
@@ -52,6 +52,15 @@ def process_s3_path(path: str) -> Tuple[str, str]:
     return (bucket, key)
 
 
+def process_s3_url(url: str) -> Tuple[str, str, str]:
+    protocol = resolve_s3_protocol(url)
+    no_protocol_url = remove_s3_protocol(url)
+    path_elements = no_protocol_url.split("/")
+    bucket = path_elements[0]
+    key = "/".join(path_elements[1:])
+    return ("" if protocol is None else protocol, bucket, key)
+
+
 def hash_file(filepath, hash_func=hashlib.sha256) -> AnyStr:
     """Generate a hash for a file.
 
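A minimal sketch of the new helper's expected behaviour, assuming the pre-existing resolve_s3_protocol and remove_s3_protocol helpers in utils_fs respectively return and strip an "s3://"-style prefix (the urls below are invented for illustration):

    # hypothetical inputs; the protocol element falls back to "" when the url has none
    protocol, bucket, key = process_s3_url("s3://my-bucket/raw/2024/data.csv")
    # -> ("s3://", "my-bucket", "raw/2024/data.csv")
    protocol, bucket, key = process_s3_url("my-bucket/raw/2024/data.csv")
    # -> ("", "my-bucket", "raw/2024/data.csv")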
tgedr_dataops-0.0.33/src/tgedr/dataops/source/abstract_s3_file_source.py
@@ -0,0 +1,43 @@
+from abc import ABC
+import logging
+from typing import Any, Dict, List, Optional
+
+from tgedr.dataops.commons.s3_connector import S3Connector
+from tgedr.dataops.commons.utils_fs import process_s3_url
+from tgedr.dataops.source.source import Source, SourceException
+
+
+logger = logging.getLogger()
+
+
+class AbstractS3FileSource(Source, S3Connector, ABC):
+    """abstract class used to read file sources from s3"""
+
+    CONTEXT_KEY_URL = "url"
+    CONTEXT_KEY_SUFFIX = "suffix"
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        Source.__init__(self, config=config)
+        S3Connector.__init__(self)
+
+    def list(self, context: Optional[Dict[str, Any]] = None) -> List[str]:
+        logger.info(f"[list|in] ({context})")
+
+        result: List[str] = []
+        if self.CONTEXT_KEY_URL not in context:
+            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
+
+        protocol, bucket, key = process_s3_url(context[self.CONTEXT_KEY_URL])
+
+        objs = self._client.list_objects_v2(Bucket=bucket, Prefix=key)
+        result = [
+            (protocol + bucket + "/" + entry["Key"]) for entry in objs["Contents"] if not (entry["Key"]).endswith("/")
+        ]
+
+        if self.CONTEXT_KEY_SUFFIX in context:
+            suffix: str = context[self.CONTEXT_KEY_SUFFIX]
+            result = [f for f in result if f.endswith(suffix)]
+
+        logger.debug(f"[list|out] => {result}")
+        logger.info(f"[list|out] => result len: {len(result)}")
+        return result
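A usage sketch of the listing contract through a concrete subclass (bucket, prefix and file names invented for illustration):

    # list all csv objects under a prefix; the "suffix" key is optional
    source = PdDfS3Source()  # any concrete AbstractS3FileSource subclass
    files = source.list(context={"url": "s3://my-bucket/landing/", "suffix": ".csv"})
    # -> e.g. ["s3://my-bucket/landing/a.csv", "s3://my-bucket/landing/b.csv"]

The returned urls are rebuilt as protocol + bucket + "/" + key, and "directory" placeholder keys (those ending in "/") are skipped. Note that boto3's list_objects_v2 returns at most 1000 keys per call and this implementation reads a single page.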
tgedr_dataops-0.0.33/src/tgedr/dataops/source/pd_df_s3_source.py
@@ -0,0 +1,51 @@
+from io import StringIO
+import logging
+from typing import Any, Dict, Optional
+import pandas as pd
+
+from tgedr.dataops.commons.utils_fs import process_s3_url
+from tgedr.dataops.source.abstract_s3_file_source import AbstractS3FileSource
+from tgedr.dataops.source.source import SourceException
+
+logger = logging.getLogger()
+
+
+class PdDfS3Source(AbstractS3FileSource):
+    """class used to read a pandas dataframe from a csv file in s3"""
+
+    CONTEXT_KEY_FILE_FORMAT = "file_format"
+    CONTEXT_KEY_SEPARATOR = "sep"
+    CONTEXT_KEY_NO_HEADER = "no_header"
+    CONTEXT_KEY_COLUMN_NAMES = "column_names"
+    CONTEXT_KEY_SCHEMA_TYPES = "schema_types"
+    DEFAULT_FORMAT = "csv"
+    DEFAULT_SEPARATOR = ","
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        super().__init__(config=config)
+
+    def get(self, context: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
+        """retrieves a pandas dataframe, by default reading it from a csv,
+        you can ask for a different format using the context key 'file_format' (available formats: csv)"""
+        logger.info(f"[get|in] ({context})")
+        result: pd.DataFrame = None
+
+        if self.CONTEXT_KEY_URL not in context:
+            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
+
+        protocol, bucket, key = process_s3_url(context[self.CONTEXT_KEY_URL])
+
+        obj = self._client.get_object(Bucket=bucket, Key=key)
+        data = obj["Body"].read().decode("utf-8")
+
+        header = 0 if self.CONTEXT_KEY_NO_HEADER not in context else None
+        names = None if self.CONTEXT_KEY_COLUMN_NAMES not in context else context[self.CONTEXT_KEY_COLUMN_NAMES]
+        dtype = None if self.CONTEXT_KEY_SCHEMA_TYPES not in context else context[self.CONTEXT_KEY_SCHEMA_TYPES]
+        sep = (
+            self.DEFAULT_SEPARATOR if self.CONTEXT_KEY_SEPARATOR not in context else context[self.CONTEXT_KEY_SEPARATOR]
+        )
+
+        result = pd.read_csv(StringIO(data), sep=sep, header=header, names=names, dtype=dtype)
+
+        logger.info(f"[get|out] => {result}")
+        return result
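A usage sketch with invented context values, showing how the optional keys map onto pandas.read_csv parameters:

    src = PdDfS3Source()
    df = src.get(
        context={
            "url": "s3://my-bucket/landing/prices.csv",   # required
            "sep": ";",                                   # defaults to ","
            "no_header": True,                            # mere presence sets header=None
            "column_names": ["date", "ticker", "price"],  # forwarded as names=
            "schema_types": {"price": "float64"},         # forwarded as dtype=
        }
    )

Only the presence of the "no_header" key matters (its value is ignored), and although CONTEXT_KEY_FILE_FORMAT and DEFAULT_FORMAT are declared, csv is the only format the class currently reads.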
{tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33/src/tgedr_dataops.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tgedr-dataops
-Version: 0.0.32
+Version: 0.0.33
 Summary: data operations related code
 Home-page: https://github.com/jtviegas-sandbox/dataops
 Author: joao tiago viegas
{tgedr_dataops-0.0.32 → tgedr_dataops-0.0.33}/src/tgedr_dataops.egg-info/SOURCES.txt
@@ -17,9 +17,11 @@ src/tgedr/dataops/sink/local_fs_file_sink.py
 src/tgedr/dataops/sink/s3_file_sink.py
 src/tgedr/dataops/sink/sink.py
 src/tgedr/dataops/source/__init__.py
+src/tgedr/dataops/source/abstract_s3_file_source.py
 src/tgedr/dataops/source/delta_table_source.py
 src/tgedr/dataops/source/local_delta_table.py
 src/tgedr/dataops/source/local_fs_file_source.py
+src/tgedr/dataops/source/pd_df_s3_source.py
 src/tgedr/dataops/source/s3_delta_table.py
 src/tgedr/dataops/source/s3_file_copy.py
 src/tgedr/dataops/source/s3_file_source.py