ingestflow-sdk 1.0.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestflow_sdk-1.0.9/PKG-INFO +13 -0
- ingestflow_sdk-1.0.9/ingestflow/__init__.py +23 -0
- ingestflow_sdk-1.0.9/ingestflow_sdk.egg-info/PKG-INFO +13 -0
- ingestflow_sdk-1.0.9/ingestflow_sdk.egg-info/SOURCES.txt +22 -0
- ingestflow_sdk-1.0.9/ingestflow_sdk.egg-info/dependency_links.txt +1 -0
- ingestflow_sdk-1.0.9/ingestflow_sdk.egg-info/requires.txt +3 -0
- ingestflow_sdk-1.0.9/ingestflow_sdk.egg-info/top_level.txt +1 -0
- ingestflow_sdk-1.0.9/pyproject.toml +6 -0
- ingestflow_sdk-1.0.9/setup.cfg +4 -0
- ingestflow_sdk-1.0.9/setup.py +24 -0
- ingestflow_sdk-1.0.9/tests/test_csv_reader.py +11 -0
- ingestflow_sdk-1.0.9/tests/test_data_validator.py +34 -0
- ingestflow_sdk-1.0.9/tests/test_database_connection.py +10 -0
- ingestflow_sdk-1.0.9/tests/test_duplicate_file.py +35 -0
- ingestflow_sdk-1.0.9/tests/test_file_tracker.py +13 -0
- ingestflow_sdk-1.0.9/tests/test_incremental_load.py +54 -0
- ingestflow_sdk-1.0.9/tests/test_json_reader.py +15 -0
- ingestflow_sdk-1.0.9/tests/test_logger.py +12 -0
- ingestflow_sdk-1.0.9/tests/test_merge_load.py +40 -0
- ingestflow_sdk-1.0.9/tests/test_metadata_manager.py +13 -0
- ingestflow_sdk-1.0.9/tests/test_postgres_writer.py +20 -0
- ingestflow_sdk-1.0.9/tests/test_reject_file.py +23 -0
- ingestflow_sdk-1.0.9/tests/test_run_id_generator.py +18 -0
- ingestflow_sdk-1.0.9/tests/test_schema_validator.py +20 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ingestflow-sdk
|
|
3
|
+
Version: 1.0.9
|
|
4
|
+
Summary: A Python SDK for data ingestion and tracking
|
|
5
|
+
Author: InduPrakash
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Requires-Dist: pandas>=2.2.0
|
|
8
|
+
Requires-Dist: psycopg2-binary>=2.9.9
|
|
9
|
+
Requires-Dist: python-dotenv>=1.0.1
|
|
10
|
+
Dynamic: author
|
|
11
|
+
Dynamic: requires-dist
|
|
12
|
+
Dynamic: requires-python
|
|
13
|
+
Dynamic: summary
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from ingestflow.database.connection import DatabaseConnection
|
|
2
|
+
from ingestflow.trackers.metadata_manager import MetadataManager
|
|
3
|
+
from ingestflow.logging.logger import Logger
|
|
4
|
+
from ingestflow.trackers.file_tracker import FileTracker
|
|
5
|
+
from ingestflow.readers.csv_reader import CsvReader
|
|
6
|
+
from ingestflow.readers.json_reader import JsonReader
|
|
7
|
+
from ingestflow.utils.run_id_generator import RunIdGenerator
|
|
8
|
+
from ingestflow.trackers.record_tracker import RecordTracker
|
|
9
|
+
from ingestflow.writers.postgres_writer import PostgresWriter
|
|
10
|
+
from ingestflow.validators.schema_validator import SchemaValidator
|
|
11
|
+
|
|
12
|
+
# Public API of the ``ingestflow`` package: the class names re-exported by
# ``from ingestflow import *``. Every entry corresponds to one of the
# imports at the top of this module.
__all__ = [
    "DatabaseConnection",
    "MetadataManager",
    "FileTracker",
    "CsvReader",
    "JsonReader",
    "RunIdGenerator",
    "Logger",
    "RecordTracker",
    "PostgresWriter",
    "SchemaValidator",
]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ingestflow-sdk
|
|
3
|
+
Version: 1.0.9
|
|
4
|
+
Summary: A Python SDK for data ingestion and tracking
|
|
5
|
+
Author: InduPrakash
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Requires-Dist: pandas>=2.2.0
|
|
8
|
+
Requires-Dist: psycopg2-binary>=2.9.9
|
|
9
|
+
Requires-Dist: python-dotenv>=1.0.1
|
|
10
|
+
Dynamic: author
|
|
11
|
+
Dynamic: requires-dist
|
|
12
|
+
Dynamic: requires-python
|
|
13
|
+
Dynamic: summary
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
pyproject.toml
|
|
2
|
+
setup.py
|
|
3
|
+
ingestflow/__init__.py
|
|
4
|
+
ingestflow_sdk.egg-info/PKG-INFO
|
|
5
|
+
ingestflow_sdk.egg-info/SOURCES.txt
|
|
6
|
+
ingestflow_sdk.egg-info/dependency_links.txt
|
|
7
|
+
ingestflow_sdk.egg-info/requires.txt
|
|
8
|
+
ingestflow_sdk.egg-info/top_level.txt
|
|
9
|
+
tests/test_csv_reader.py
|
|
10
|
+
tests/test_data_validator.py
|
|
11
|
+
tests/test_database_connection.py
|
|
12
|
+
tests/test_duplicate_file.py
|
|
13
|
+
tests/test_file_tracker.py
|
|
14
|
+
tests/test_incremental_load.py
|
|
15
|
+
tests/test_json_reader.py
|
|
16
|
+
tests/test_logger.py
|
|
17
|
+
tests/test_merge_load.py
|
|
18
|
+
tests/test_metadata_manager.py
|
|
19
|
+
tests/test_postgres_writer.py
|
|
20
|
+
tests/test_reject_file.py
|
|
21
|
+
tests/test_run_id_generator.py
|
|
22
|
+
tests/test_schema_validator.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ingestflow
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from setuptools import find_namespace_packages, setup

# Packaging configuration for the ingestflow SDK.
#
# NOTE(review): the 1.0.9 sdist manifest (SOURCES.txt) lists only
# ``ingestflow/__init__.py`` under the package, although that module imports
# ``ingestflow.database``, ``ingestflow.trackers``, ``ingestflow.readers``,
# etc. ``find_packages()`` only collects directories that contain an
# ``__init__.py``, so the sub-packages were presumably skipped for lacking
# one -- confirm against the repository layout. ``find_namespace_packages``
# collects them either way.
setup(
    # Must match the project name registered on PyPI.
    name="ingestflow-sdk",
    version="1.0.9",
    description="A Python SDK for data ingestion and tracking",
    author="InduPrakash",
    # Restrict discovery to ``ingestflow`` and everything beneath it so that
    # ``tests`` (or any other top-level directory) is never installed.
    packages=find_namespace_packages(include=["ingestflow", "ingestflow.*"]),

    # Minimal Python version
    python_requires=">=3.10",

    # Runtime dependencies
    install_requires=[
        "pandas>=2.2.0",
        "psycopg2-binary>=2.9.9",
        "python-dotenv>=1.0.1",
    ],

    # Include non-Python data files that live inside the package and are
    # matched by MANIFEST.in / version control.
    include_package_data=True,
)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from ingestflow.validators.data_validator import DataValidator
|
|
3
|
+
|
|
4
|
+
def test_data_validator():
    """Run ``DataValidator`` over a small frame that contains nulls.

    Builds a five-row DataFrame with missing ``id`` and ``name`` values,
    unpacks the two frames returned by ``DataValidator.validate_dataframe``
    and prints them, then passes the invalid frame together with a run id to
    ``DataValidator.generate_reject_file`` and prints the result.
    """
    ids = [1, None, 3, 4, None]
    names = ["John", None, "Bob", None, "Alice"]
    frame = pd.DataFrame({"id": ids, "name": names})

    valid_df, invalid_df = DataValidator.validate_dataframe(frame)

    # Show both halves under their respective headings.
    for heading, part in (
        ("\nVALID DATA", valid_df),
        ("\nINVALID DATA", invalid_df),
    ):
        print(heading)
        print(part)

    reject_file = DataValidator.generate_reject_file(
        invalid_df, "RUN_20260903_001"
    )
    print(reject_file)


# Allow running this file directly, without pytest.
if __name__ == "__main__":
    test_data_validator()
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# from ingestflow.trackers.file_tracker import FileTracker
|
|
2
|
+
|
|
3
|
+
# tracker = FileTracker()
|
|
4
|
+
# file_hash = (
|
|
5
|
+
# tracker.get_file_hash(
|
|
6
|
+
# "tests/source_file/employees.csv"
|
|
7
|
+
# )
|
|
8
|
+
# )
|
|
9
|
+
# is_duplicate = (
|
|
10
|
+
# tracker.is_duplicate_file(
|
|
11
|
+
# file_hash
|
|
12
|
+
# )
|
|
13
|
+
# )
|
|
14
|
+
# print(
|
|
15
|
+
# f"Duplicate File : "
|
|
16
|
+
# f"{is_duplicate}"
|
|
17
|
+
# )
|
|
18
|
+
|
|
19
|
+
from ingestflow.trackers.file_tracker import FileTracker
|
|
20
|
+
from ingestflow.trackers.metadata_manager import MetadataManager
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_duplicate_file():
    """Check that ``FileTracker.is_duplicate_file`` answers True or False.

    Calls ``MetadataManager.create_metadata_tables`` first, obtains a hash
    for the sample employees CSV via ``FileTracker.get_file_hash`` and feeds
    that hash to ``is_duplicate_file``.
    """
    MetadataManager().create_metadata_tables()

    file_tracker = FileTracker()
    # NOTE(review): tests/test_file_tracker.py calls ``generate_file_hash``
    # instead -- confirm which method name FileTracker actually exposes.
    digest = file_tracker.get_file_hash("tests/source_file/employees.csv")
    duplicate_flag = file_tracker.is_duplicate_file(digest)

    # Either answer is accepted; only the kind of value is checked.
    assert duplicate_flag in (True, False)
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from ingestflow.trackers.file_tracker import FileTracker
|
|
2
|
+
|
|
3
|
+
def test_file_hash():
    """Print what ``FileTracker.generate_file_hash`` returns for the sample CSV."""
    sample_path = "tests/source_file/employees.csv"
    digest = FileTracker().generate_file_hash(sample_path)
    print(f"File Hash : {digest}")


# Allow running this file directly, without pytest.
if __name__ == "__main__":
    test_file_hash()
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# import pandas as pd
|
|
2
|
+
|
|
3
|
+
# from ingestflow.trackers.record_tracker import RecordTracker
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# def test_incremental_load():
|
|
7
|
+
|
|
8
|
+
# source_df = pd.DataFrame(
|
|
9
|
+
# {
|
|
10
|
+
# "id": [1, 2, 3],
|
|
11
|
+
# "updated_date": [
|
|
12
|
+
# "2026-01-01",
|
|
13
|
+
# "2026-02-01",
|
|
14
|
+
# "2026-03-01"
|
|
15
|
+
# ]
|
|
16
|
+
# }
|
|
17
|
+
# )
|
|
18
|
+
|
|
19
|
+
# tracker = RecordTracker()
|
|
20
|
+
|
|
21
|
+
# incremental_df = (
|
|
22
|
+
# tracker.get_incremental_records(
|
|
23
|
+
# source_df,
|
|
24
|
+
# "employees",
|
|
25
|
+
# "updated_date"
|
|
26
|
+
# )
|
|
27
|
+
# )
|
|
28
|
+
|
|
29
|
+
# assert incremental_df is not None
|
|
30
|
+
|
|
31
|
+
import pandas as pd
|
|
32
|
+
|
|
33
|
+
from ingestflow.trackers.record_tracker import RecordTracker
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_incremental_load():
    """Check that ``RecordTracker.get_incremental_records`` returns something.

    Passes a single-column frame (``id`` 1 through 5), the table name
    ``"employees"`` and the column name ``"id"``, and asserts only that the
    result is not ``None``.
    """
    frame = pd.DataFrame({"id": [1, 2, 3, 4, 5]})

    record_tracker = RecordTracker()
    result = record_tracker.get_incremental_records(frame, "employees", "id")

    assert result is not None
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from ingestflow.logging.logger import Logger
|
|
2
|
+
|
|
3
|
+
# Sample run summary handed to ``Logger.print_summary``.
summary = {
    "Run ID": "RUN_20260603_001",
    "File Name": "employees.csv",
    "Load Type": "merge",
    "Inserted Records": 10,
    "Updated Records": 2,
    "Status": "SUCCESS",
}


def test_print_summary():
    """Call ``Logger.print_summary`` with the sample run summary.

    The call lives inside a ``test_*`` function so that pytest collects it
    as a test instead of executing it as an import-time side effect during
    collection.
    """
    Logger.print_summary(summary)


# Preserve the original script behaviour when the file is run directly.
if __name__ == "__main__":
    test_print_summary()
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
from ingestflow.trackers.record_tracker import RecordTracker
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def test_merge_load():
    """Check that ``RecordTracker.identify_merge_records`` returns two values.

    Sends a three-row frame, the table name ``"employees"`` and the key list
    ``["id"]``; prints both returned frames and asserts that neither is
    ``None``.
    """
    rows = {
        "id": [1, 2, 3],
        "name": ["John Updated", "Alice", "Bob"],
    }
    frame = pd.DataFrame(rows)

    merge_result = RecordTracker().identify_merge_records(
        frame, "employees", ["id"]
    )
    insert_df, update_df = merge_result

    print("\nINSERT RECORDS")
    print(insert_df)

    print("\nUPDATE RECORDS")
    print(update_df)

    assert insert_df is not None
    assert update_df is not None
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from ingestflow.trackers.metadata_manager import MetadataManager
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def test_metadata_tables():
    """Call ``MetadataManager.create_metadata_tables`` and report completion."""
    MetadataManager().create_metadata_tables()
    # Reached only if the call above did not raise.
    print("Metadata tables created successfully")


# Allow running this file directly, without pytest.
if __name__ == "__main__":
    test_metadata_tables()
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from ingestflow.writers.postgres_writer import PostgresWriter
|
|
3
|
+
|
|
4
|
+
def test_postgres_writer():
    """Exercise ``PostgresWriter`` table creation and insertion.

    Calls ``create_table_if_not_exists`` and then ``insert_records`` for the
    ``"employees"`` table with a two-row frame, and asserts that the value
    returned by ``insert_records`` is not negative.
    """
    frame = pd.DataFrame({"id": [1, 2], "name": ["John", "Alice"]})

    pg_writer = PostgresWriter()
    pg_writer.create_table_if_not_exists("employees", frame)
    row_count = pg_writer.insert_records("employees", frame)

    assert row_count >= 0
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from ingestflow.validators.data_validator import DataValidator
|
|
3
|
+
|
|
4
|
+
def test_reject_file():
    """Print what ``DataValidator.generate_reject_file`` returns.

    The input is a one-row frame that already carries a ``reject_reason``
    column, passed along with a fixed run id.
    """
    rejected_rows = pd.DataFrame(
        {
            "id": [2],
            "name": ["Alice"],
            "reject_reason": ["Sample Reject"],
        }
    )

    print(
        DataValidator.generate_reject_file(rejected_rows, "RUN_20260603_001")
    )


# Allow running this file directly, without pytest.
if __name__ == "__main__":
    test_reject_file()
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from ingestflow.utils.run_id_generator import RunIdGenerator
|
|
2
|
+
|
|
3
|
+
def test_run_id():
    """Print three consecutive results of ``RunIdGenerator.generate``."""
    for _ in range(3):
        print(RunIdGenerator.generate())


# Allow running this file directly, without pytest.
if __name__ == "__main__":
    test_run_id()
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from ingestflow.validators.schema_validator import SchemaValidator
|
|
3
|
+
|
|
4
|
+
def test_schema_validator():
    """Print the result of ``SchemaValidator.validate_schema``.

    Validates a two-row frame against the ``"employees"`` table with
    ``schema_mode="fail"``.
    """
    frame = pd.DataFrame({"id": [1, 2], "name": ["John", "Alice"]})

    outcome = SchemaValidator().validate_schema(
        dataframe=frame,
        table_name="employees",
        schema_mode="fail",
    )
    print(outcome)


# Allow running this file directly, without pytest.
if __name__ == "__main__":
    test_schema_validator()
|