mx-stream-core 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
File without changes
@@ -0,0 +1,4 @@
1
import os

# Application identity; used e.g. as a prefix for Redis keys and as the Spark app name.
app_name = os.environ.get('APP_NAME', 'Mindx CDP')
# Logical service name; used as a prefix when building Kafka topic names.
service_name = os.environ.get('SERVICE_NAME', 'udp')
@@ -0,0 +1,6 @@
1
import os

from mx_stream_core.config.s3 import s3_enable, s3_bucket

# Sub-folder (under the S3 bucket) where Delta tables are stored.
delta_folder = os.getenv('DELTA_FOLDER', ".delta")
# FIX: s3_enable is the string 'true'/'false' read from the environment, so a
# bare truthiness test was always True and the /tmp fallback was unreachable.
# Compare against 'true' explicitly, matching the check used in spark.py.
default_delta_path = f'{s3_bucket}/{delta_folder}' if s3_enable == 'true' else '/tmp/.delta'
# Root path for Delta storage; the environment override wins over the computed default.
delta_path = os.getenv('DELTA_PATH', default_delta_path)
@@ -0,0 +1,3 @@
1
import os

# EventStoreDB connection string; defaults to a local insecure (tls=false) node.
eventstore_connection_string = os.environ.get('EVENT_STORE_CONNECTION_STRING', 'esdb://localhost:2113?tls=false')
@@ -0,0 +1,3 @@
1
import os

# Kafka bootstrap servers (host:port), read from KAFKA_HOST.
kafka_bootstrap_servers = os.environ.get('KAFKA_HOST', 'kafka:9092')
@@ -0,0 +1,11 @@
1
# Per-layer event-name lookup tables: keys are lifecycle stages, values are
# the event identifiers emitted on the corresponding topics.

# Events for the "word" layer.
word = {"ingested": 'INGESTED'}

# Events for the "book" layer.
book = {"ingested": 'INGESTED'}

# Events shared by all layers.
common = {
    "ingested": 'INGESTED',
    "transformed": 'TRANSFORMED',
}
@@ -0,0 +1,7 @@
1
import os

# S3/MinIO credentials and connection settings, with local-development defaults.
s3_access_key = os.environ.get('S3_ACCESS_KEY', 'minio')
s3_secret_key = os.environ.get('S3_SECRET_KEY', 'minio123')
s3_endpoint = os.environ.get('S3_ENDPOINT', 'http://localhost:9000')
s3_bucket = os.environ.get('S3_BUCKET', 'cdp')
# NOTE(review): kept as the string 'true'/'false' (not a bool) because other
# modules compare this value against the literal 'true'.
s3_enable = os.environ.get('S3_ENABLE', 'true')
@@ -0,0 +1,3 @@
1
import os

# Spark master URL; defaults to local mode using all available cores.
master_url = os.environ.get('MASTER_URL', "local[*]")
@@ -0,0 +1,2 @@
1
# Canonical stream (collection) names used across ingestors.
word = 'words'
book = 'books'
File without changes
@@ -0,0 +1,8 @@
1
from mx_stream_core.config.delta import delta_path


def get_delta_path(table_name=None) -> str:
    """Return the Delta storage path, optionally scoped to one table.

    :param table_name: optional table name appended as a sub-folder
    :return: full Delta path (root path when table_name is None)
    """
    # FIX: removed a leftover debug print() that wrote the path to stdout
    # every time the function was called without a table name.
    if table_name is not None:
        return f"{delta_path}/{table_name}"
    return delta_path
@@ -0,0 +1,39 @@
1
import logging
import os
from mx_stream_core.infrastructure.redis import get_stream_position
from esdbclient import EventStoreDBClient

# Lazily-created singleton EventStoreDBClient (see get_event_store_client).
_client = None
10
+
11
+
12
def get_stream(stream_name, group):
    """Subscribe to an Event Store stream from its last saved position.

    :param stream_name: event store stream name
    :param group: event store group name; used to build the stream position key in redis
    :return: a catch-up subscription starting at the saved position
    """
    position = get_stream_position(stream_name, group)
    # Lazy %-style args keep formatting out of the hot path; output is identical.
    logging.info("[Event Store] subscribe %s stream from position %s", stream_name, position)

    client = get_event_store_client()
    return client.subscribe_to_stream(
        stream_name=stream_name,
        stream_position=position,
    )
26
+
27
+
28
def get_event_store_client():
    """Return the shared EventStoreDBClient, creating it on first use.

    The connection string is read from EVENT_STORE_CONNECTION_STRING and
    defaults to a local insecure node.

    :return: the singleton EventStoreDBClient
    """
    global _client
    if _client is None:
        conn = os.getenv('EVENT_STORE_CONNECTION_STRING', 'esdb://localhost:2113?tls=false')
        # FIX: removed the dead `if conn is None: raise ...` branch —
        # os.getenv with a default value can never return None, so that
        # check was unreachable.
        _client = EventStoreDBClient(conn)
    return _client
@@ -0,0 +1,139 @@
1
from kafka import KafkaProducer
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, from_json, to_json, struct
from typing_extensions import deprecated

from mx_stream_core.config.kafka import kafka_bootstrap_servers
from mx_stream_core.config.app import service_name

# Lazily-created singleton KafkaProducer (see get_kafka_producer).
_producer = None
11
+
12
+
13
def get_kafka_producer():
    """Lazily create and return the shared Kafka producer.

    Message values are serialised as JSON so consumers receive valid JSON
    (read_stream_from_kafka parses the payload with from_json).

    :return: the singleton KafkaProducer
    """
    import json

    global _producer
    if _producer is None:
        _producer = KafkaProducer(
            bootstrap_servers=kafka_bootstrap_servers,
            # FIX: str(v) on a dict produces a single-quoted Python repr,
            # which is not valid JSON and cannot be parsed by the Spark
            # from_json reader; serialise with json.dumps instead.
            # default=str keeps non-JSON types (e.g. datetimes) best-effort.
            value_serializer=lambda v: json.dumps(v, default=str).encode('utf-8')
        )
    return _producer
25
+
26
+
27
def read_stream_from_kafka(spark, topic, schema) -> DataFrame:
    """Create a streaming DataFrame over a Kafka topic.

    :param spark: SparkSession
    :param topic: Kafka topic
    :param schema: Schema of the JSON payloads
    :return: streaming DataFrame with one column per schema field
    """
    raw = (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
        .option("subscribe", topic)
        .option("startingOffsets", "earliest")
        .load()
    )
    # Kafka values arrive as bytes; decode to text, then parse the JSON
    # payload and flatten it into top-level columns.
    decoded = raw.selectExpr("CAST(value AS STRING) as json_str")
    parsed = decoded.withColumn("json_data", from_json(col("json_str"), schema))
    return parsed.select("json_data.*")
45
+
46
+
47
def write_stream_to_kafka(df, topic, checkpoint_path):
    """Write a streaming DataFrame to a Kafka topic as JSON.

    :param df: DataFrame
    :param topic: Kafka topic
    :param checkpoint_path: Checkpoint path for the stream
    :return: the started StreamingQuery
    """
    # Pack every row into a single JSON string column named "value" —
    # the payload column the Kafka sink expects. (Renamed from `json`,
    # which shadowed the stdlib module name.)
    payload = df.select(to_json(struct(*df.columns)).alias("value"))

    # Start the continuous write to Kafka.
    return (
        payload.writeStream
        .format("kafka")
        .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
        .option("topic", topic)
        .option("checkpointLocation", checkpoint_path)
        .start()
    )
70
+
71
+
72
def delivery_report(err, msg=None):
    """Report the result of a message delivery attempt.

    :param err: The error that occurred or None if the message was delivered successfully.
    :param msg: The message that was sent or failed to send. Optional because
                kafka-python's Future.add_errback invokes its callback with
                only the exception.
    :return:
    """
    # FIX: msg was a required parameter, but this function is registered via
    # add_errback, which calls it with a single argument — that raised a
    # TypeError on every failed delivery. msg now defaults to None and the
    # success branch guards against it.
    if err is not None:
        print('Message delivery failed: {}'.format(err))
    elif msg is not None:
        print('Message delivered to {} [{}]'.format(msg.topic(), msg.partition()))
83
+
84
+
85
def produce_kafka_message(topic, message):
    """Send one message to Kafka and wait until it is delivered.

    :param topic: Kafka topic
    :param message: Message to send
    :return:
    """
    producer = get_kafka_producer()
    future = producer.send(topic, value=message)
    future.add_errback(delivery_report)
    # Block until the buffered message is actually sent.
    producer.flush()
95
+
96
+
97
def produce_kafka_messages(topic, event_name: str, df: DataFrame):
    """Produce one Kafka message per DataFrame row.

    Each message is an envelope {"event": <event_name>, "data": <row dict>}.

    :param topic: Kafka topic
    :param event_name: event name stamped into each message envelope
    :param df: DataFrame whose rows become message payloads
    :return:
    """
    producer = get_kafka_producer()
    for row in df.toLocalIterator():
        message = {
            "event": event_name,
            "data": row.asDict()
        }
        producer.send(topic, value=message).add_errback(delivery_report)
    # FIX: flush once after the loop. Flushing inside the loop blocked on
    # every single row and defeated the producer's batching.
    producer.flush()
112
+
113
+
114
@deprecated("Use create_ingestor_topic_name or create_transformation_topic_name instead")
def create_topic_name(ingestor_name: str) -> str:
    """Deprecated alias for create_ingestor_topic_name.

    :param ingestor_name: Name of the ingestor (e.g. "word")
    :return: topic name of the form '<service>_ingestor_<name>'
    """
    # Delegate so the ingestor topic naming scheme lives in exactly one place.
    return create_ingestor_topic_name(ingestor_name)
122
+
123
+
124
def create_ingestor_topic_name(ingestor_name: str) -> str:
    """Build the Kafka topic name for an ingestor.

    :param ingestor_name: Name of the ingestor (e.g. "word")
    :return: topic name of the form '<service>_ingestor_<name>'
    """
    return '{}_ingestor_{}'.format(service_name, ingestor_name)
131
+
132
+
133
def create_transformation_topic_name(transformation_name: str) -> str:
    """Build the Kafka topic name for a transformation.

    :param transformation_name: Name of the transformation (e.g. "book")
    :return: topic name of the form '<service>_transformation_<name>'
    """
    return '{}_transformation_{}'.format(service_name, transformation_name)
@@ -0,0 +1,63 @@
1
import logging
import os

import redis

from mx_stream_core.config.app import app_name

# Redis connection settings with local-development defaults.
redis_host = os.getenv('REDIS_HOST', 'localhost')
# FIX: environment variables are always strings; coerce port/db to int so the
# values have a consistent type whether or not the variable is set (the
# original yielded int defaults but str values from the environment).
redis_port = int(os.getenv('REDIS_PORT', 6379))
redis_db = int(os.getenv('REDIS_DB', 0))
redis_password = os.getenv('REDIS_PASSWORD', None)

# Lazily-created singleton Redis client (see get_redis_client).
_r = None

# Log the Redis connection details at import time.
logging.info(f'[Redis] host: {redis_host}, port: {redis_port}, db: {redis_db}')
19
+
20
+
21
def get_redis_client():
    """Return the process-wide Redis client, creating it on first use.

    :return: the singleton redis.Redis instance
    """
    global _r
    if _r is not None:
        return _r
    _r = redis.Redis(
        host=redis_host,
        port=redis_port,
        db=redis_db,
        password=redis_password,
    )
    return _r
30
+
31
+
32
def get_stream_key(stream_name, group):
    """Build the Redis key that stores a stream's saved position.

    :param stream_name: event store stream name
    :param group: stream group
    :return: key of the form '<app>_<group>_<stream>_stream_position'
    """
    return f"{app_name}_{group}_{stream_name}_stream_position"
40
+
41
+
42
def get_stream_position(stream_name, group):
    """Read a stream's saved position from Redis.

    :param stream_name: event store stream name
    :param group: stream group
    :return: last stored position, or 0 when none has been saved yet
    """
    raw = get_redis_client().get(get_stream_key(stream_name, group))
    if not raw:
        return 0
    return int(raw)
52
+
53
+
54
def set_stream_position(stream_name, group, position):
    """Persist a stream's position in Redis.

    :param stream_name: event store stream name
    :param group: stream group
    :param position: position value to store
    :return:
    """
    client = get_redis_client()
    client.set(get_stream_key(stream_name, group), position)
@@ -0,0 +1,28 @@
1
import os

from pyspark.sql import SparkSession

from mx_stream_core.config.app import app_name
from mx_stream_core.config.s3 import s3_access_key, s3_secret_key, s3_endpoint, s3_enable, s3_bucket
from mx_stream_core.config.spark import master_url

# Base Spark session builder with Delta Lake extensions enabled.
spark_builder = SparkSession.builder.appName(app_name).master(master_url) \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Wire up S3A credentials only when S3 is explicitly enabled.
if s3_enable == 'true':
    spark_builder.config("spark.hadoop.fs.s3a.access.key", s3_access_key) \
        .config("spark.hadoop.fs.s3a.secret.key", s3_secret_key) \
        .config("spark.hadoop.fs.s3a.endpoint", s3_endpoint)

spark = spark_builder.getOrCreate()
spark.sparkContext.setLogLevel(os.getenv('LOG_LEVEL', 'WARN'))

checkpoint_folder = os.getenv('CHECKPOINT_FOLDER', ".checkpoints")
# FIX: s3_enable is the string 'true'/'false'; a bare truthiness test was
# always True, so the /tmp fallback was unreachable. Compare against 'true'
# explicitly, consistent with the S3 builder check above.
default_root_checkpoint_path = f'{s3_bucket}/{checkpoint_folder}' if s3_enable == 'true' else '/tmp/.checkpoints'
root_check_point_path = os.getenv('CHECKPOINT_PATH', default_root_checkpoint_path)
23
+
24
+
25
def get_checkpoint_path(table_name=None) -> str:
    """Return the checkpoint location, optionally scoped to one table.

    :param table_name: optional table name used to namespace the checkpoint
    :return: checkpoint path under the configured root checkpoint path
    """
    suffix = "checkpoint" if table_name is None else f"{table_name}_checkpoint"
    return f"{root_check_point_path}/{suffix}"
File without changes
@@ -0,0 +1,9 @@
1
import argparse


def extract_args():
    """Parse the process command-line arguments.

    :return: argparse.Namespace with optional 'worker' and 'layer' attributes
    """
    parser = argparse.ArgumentParser(description="Process some named parameters.")
    parser.add_argument('--worker', type=str, required=False, help='Worker to run')
    parser.add_argument('--layer', type=str, required=False, help='Layer to run')
    return parser.parse_args()
@@ -0,0 +1 @@
1
+ OK
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.1
2
+ Name: mx_stream_core
3
+ Version: 0.1.2
4
+ Summary: Stream-processing core package for MindX
5
+ Home-page: http://pypi.private.server/simple/your_package_name/
6
+ Author: MindX
7
+ Author-email: quyentm@mindx.com.vn
8
+ License: LICENSE.txt
9
+ License-File: LICENSE.txt
10
+ Requires-Dist: pyspark ==3.5.1
11
+ Requires-Dist: delta-spark ==3.1.0
12
+ Requires-Dist: redis ==5.0.5
13
+ Requires-Dist: esdbclient ==1.0.25
14
+ Requires-Dist: kafka-python ==2.0.2
15
+
16
+ ## OK
@@ -0,0 +1,23 @@
1
+ mx_stream_core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ mx_stream_core/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ mx_stream_core/config/app.py,sha256=I5JT3NV_N2ISOv81tOWHyHtlZqLeOXcQxWuhU34MuXQ,105
4
+ mx_stream_core/config/delta.py,sha256=yjGBnoslG8IW1eljEYaIA0pBujovRzZKEf5tgmqD0w0,260
5
+ mx_stream_core/config/event_store.py,sha256=VySb1807-oLOpLLdrTulISUg5Ec0mD5H-Xp3sfWCYIk,119
6
+ mx_stream_core/config/kafka.py,sha256=PpmGYovIMpesZQnZMbhJZt04dHNaznce58GwHOoXlUM,75
7
+ mx_stream_core/config/layer_events.py,sha256=sk0l9eS3TbnmEGxgJKAom1Zw53JeDeEapkTTyZEZbZY,154
8
+ mx_stream_core/config/s3.py,sha256=Cb4Qi3oQdw1RzHFCILAsdLPe35oBSQ-dpXbSjwe2JtE,267
9
+ mx_stream_core/config/spark.py,sha256=gM2UnnWwsfKgjkBxaDERzY9y4dTnxRJzID4Ioh0tTrA,60
10
+ mx_stream_core/config/streams.py,sha256=_t0jKGUjuT55Pc2_iC_nGRc_NxyMI4kC4DO37USvbeA,30
11
+ mx_stream_core/infrastructure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ mx_stream_core/infrastructure/delta.py,sha256=Q_3c1zAvyfJwMz8jYIOU17NI477S8cyu7j5pEoH6nJA,226
13
+ mx_stream_core/infrastructure/event_store.py,sha256=VYXWwTkhVhAFfUhGJPsZRyURaCdGiXdivHJDiyTsKhY,1193
14
+ mx_stream_core/infrastructure/kafka.py,sha256=xPV1BQa62h9j2KHo7h86d7MuSh7vMZOdzwujhf4ae9g,4050
15
+ mx_stream_core/infrastructure/redis.py,sha256=uHiUbwJdW09Hdbm3wZLmpEuB1XFnRm67G6weeTVpBeI,1508
16
+ mx_stream_core/infrastructure/spark.py,sha256=XpBmllXpK2AEiDlng1qnZ-ZpYRAUIMgMJDJ6aH71w0g,1289
17
+ mx_stream_core/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ mx_stream_core/utils/cmd.py,sha256=KHdIjRpWlFO_m8p9zkawoBNOn8mIyZUL4U3fEsKsrCc,334
19
+ mx_stream_core-0.1.2.dist-info/LICENSE.txt,sha256=VlM5vE0z1ygXtYMCQRLrf1zfPl7vAlLW7BucmpThK7M,2
20
+ mx_stream_core-0.1.2.dist-info/METADATA,sha256=vCyNqeayroM14ypx1DbHvCD6K_vKQYDy9Sfu0NbQ7b8,453
21
+ mx_stream_core-0.1.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
22
+ mx_stream_core-0.1.2.dist-info/top_level.txt,sha256=7P7-xBnxxYlylDW1z6XXlCWBZFn_rKCU8ESabosijcw,15
23
+ mx_stream_core-0.1.2.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: bdist_wheel (0.43.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ mx_stream_core