mx-stream-core 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mx_stream_core/__init__.py +0 -0
- mx_stream_core/config/__init__.py +0 -0
- mx_stream_core/config/app.py +4 -0
- mx_stream_core/config/delta.py +6 -0
- mx_stream_core/config/event_store.py +3 -0
- mx_stream_core/config/kafka.py +3 -0
- mx_stream_core/config/layer_events.py +11 -0
- mx_stream_core/config/s3.py +7 -0
- mx_stream_core/config/spark.py +3 -0
- mx_stream_core/config/streams.py +2 -0
- mx_stream_core/infrastructure/__init__.py +0 -0
- mx_stream_core/infrastructure/delta.py +8 -0
- mx_stream_core/infrastructure/event_store.py +39 -0
- mx_stream_core/infrastructure/kafka.py +139 -0
- mx_stream_core/infrastructure/redis.py +63 -0
- mx_stream_core/infrastructure/spark.py +28 -0
- mx_stream_core/utils/__init__.py +0 -0
- mx_stream_core/utils/cmd.py +9 -0
- mx_stream_core-0.1.2.dist-info/LICENSE.txt +1 -0
- mx_stream_core-0.1.2.dist-info/METADATA +16 -0
- mx_stream_core-0.1.2.dist-info/RECORD +23 -0
- mx_stream_core-0.1.2.dist-info/WHEEL +5 -0
- mx_stream_core-0.1.2.dist-info/top_level.txt +1 -0
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
# S3 / MinIO connection settings, read from the environment with
# local-development defaults (a MinIO instance on localhost).
s3_access_key = os.getenv('S3_ACCESS_KEY', 'minio')
s3_secret_key = os.getenv('S3_SECRET_KEY', 'minio123')
s3_endpoint = os.getenv('S3_ENDPOINT', 'http://localhost:9000')
s3_bucket = os.getenv('S3_BUCKET', 'cdp')
# NOTE: this is the *string* 'true'/'false', not a bool; consumers must
# compare against the literal 'true' (plain truthiness is always True).
s3_enable = os.getenv('S3_ENABLE', 'true')
|
|
File without changes
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
from mx_stream_core.infrastructure.redis import get_stream_position
|
|
4
|
+
from esdbclient import EventStoreDBClient
|
|
5
|
+
|
|
6
|
+
"""
Event Store client
"""
# Lazily created module-wide EventStoreDB client; populated on first call
# to get_event_store_client().
_client = None
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_stream(stream_name, group):
    """
    Subscribe to an EventStoreDB stream, resuming from the last position
    recorded in Redis for the given consumer group.

    :param stream_name: event store stream name
    :param group: group name used to build the stream-position key in Redis
    :return: a subscription starting at the recorded position
    """
    position = get_stream_position(stream_name, group)
    logging.info(f"[Event Store] subscribe {stream_name} stream from position {position}")

    client = get_event_store_client()
    return client.subscribe_to_stream(stream_name=stream_name, stream_position=position)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def get_event_store_client():
    """
    Return the process-wide EventStoreDB client, creating it lazily on
    first use.

    The connection string is read from EVENT_STORE_CONNECTION_STRING and
    falls back to a local, non-TLS instance for development.

    :return: the shared EventStoreDBClient instance
    """
    global _client
    if _client is None:
        # os.getenv with a default can never return None, so the previous
        # "is None -> raise" check was dead code and has been removed.
        connection_string = os.getenv(
            'EVENT_STORE_CONNECTION_STRING', 'esdb://localhost:2113?tls=false'
        )
        _client = EventStoreDBClient(connection_string)
    return _client
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
from kafka import KafkaProducer
|
|
2
|
+
from pyspark.sql import DataFrame
|
|
3
|
+
from pyspark.sql.functions import col, from_json, to_json, struct
|
|
4
|
+
from typing_extensions import deprecated
|
|
5
|
+
|
|
6
|
+
from mx_stream_core.config.kafka import kafka_bootstrap_servers
|
|
7
|
+
from mx_stream_core.config.app import service_name
|
|
8
|
+
|
|
9
|
+
# Lazily created module-wide Kafka producer; populated on first call to
# get_kafka_producer().
_producer = None
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_kafka_producer():
    """
    Return the process-wide KafkaProducer, creating it lazily on first use.

    Values are serialized as UTF-8 JSON. The previous str()-based serializer
    emitted Python repr for dict payloads (single quotes), which is not
    valid JSON for consumers that parse the value with a JSON schema.

    :return: the shared KafkaProducer instance
    """
    import json  # local import; keeps the module's import block untouched

    global _producer
    if _producer is None:
        _producer = KafkaProducer(
            bootstrap_servers=kafka_bootstrap_servers,
            # default=str keeps non-JSON-native values (e.g. datetimes from
            # Spark rows) from raising instead of being delivered.
            value_serializer=lambda v: json.dumps(v, default=str).encode('utf-8')
        )
    return _producer
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def read_stream_from_kafka(spark, topic, schema) -> DataFrame:
    """
    Build a streaming DataFrame over a Kafka topic, decoding each record's
    value as JSON using the supplied schema.

    :param spark: SparkSession
    :param topic: Kafka topic
    :param schema: Schema of the data
    :return: DataFrame with one column per schema field
    """
    raw = (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
        .option("subscribe", topic)
        .option("startingOffsets", "earliest")
        .load()
    )
    as_text = raw.selectExpr("CAST(value AS STRING) as json_str")
    parsed = as_text.withColumn("json_data", from_json(col("json_str"), schema))
    return parsed.select("json_data.*")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def write_stream_to_kafka(df, topic, checkpoint_path):
    """
    Serialize every row of a streaming DataFrame to a single JSON string
    and publish it to a Kafka topic.

    :param df: DataFrame
    :param topic: Kafka topic
    :param checkpoint_path: Checkpoint path for the stream
    :return: the started streaming query
    """
    # Pack all columns of each row into one JSON "value" column.
    payload = df.select(to_json(struct(*df.columns)).alias("value"))

    # Write the merged stream to Kafka.
    return (
        payload.writeStream
        .format("kafka")
        .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
        .option("topic", topic)
        .option("checkpointLocation", checkpoint_path)
        .start()
    )
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def delivery_report(err, msg):
    """
    Report the outcome of a Kafka message delivery attempt.

    :param err: The error that occurred or None if the message was delivered successfully.
    :param msg: The message that was sent or failed to send.
    :return: None
    """
    if err is None:
        print('Message delivered to {} [{}]'.format(msg.topic(), msg.partition()))
    else:
        print('Message delivery failed: {}'.format(err))
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def produce_kafka_message(topic, message):
    """
    Send a single message to a Kafka topic and wait for delivery.

    :param topic: Kafka topic
    :param message: Message to send
    :return: None
    """
    kafka = get_kafka_producer()
    future = kafka.send(topic, value=message)
    future.add_errback(delivery_report)
    kafka.flush()  # Wait for all messages to be delivered
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def produce_kafka_messages(topic, event_name: str, df: DataFrame):
    """
    Publish every row of a DataFrame to Kafka wrapped in an event envelope
    of the form {"event": <event_name>, "data": <row as dict>}.

    :param topic: Kafka topic
    :param event_name: event name placed in each envelope
    :param df: DataFrame whose rows become event payloads
    :return: None
    """
    kafka = get_kafka_producer()
    for row in df.toLocalIterator():
        envelope = {
            "event": event_name,
            "data": row.asDict()
        }
        kafka.send(topic, value=envelope).add_errback(delivery_report)
    kafka.flush()  # Wait for all messages to be delivered
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@deprecated("Use create_ingestor_topic_name or create_transformation_topic_name instead")
def create_topic_name(ingestor_name: str) -> str:
    """
    Create a topic name (deprecated).

    Kept for backward compatibility; delegates to create_ingestor_topic_name
    so the ingestor naming scheme lives in exactly one place.

    :param ingestor_name: Name of the ingestor (e.g. "word")
    :return: the ingestor topic name
    """
    return create_ingestor_topic_name(ingestor_name)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def create_ingestor_topic_name(ingestor_name: str) -> str:
    """
    Build the Kafka topic name for an ingestor.

    :param ingestor_name: Name of the ingestor (e.g. "word")
    :return: topic name of the form "<service>_ingestor_<ingestor>"
    """
    return '_'.join([service_name, 'ingestor', ingestor_name])
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def create_transformation_topic_name(transformation_name: str) -> str:
    """
    Build the Kafka topic name for a transformation.

    :param transformation_name: Name of the transformation (e.g. "book")
    :return: topic name of the form "<service>_transformation_<transformation>"
    """
    return '_'.join([service_name, 'transformation', transformation_name])
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
import redis
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
from mx_stream_core.config.app import app_name
|
|
7
|
+
|
|
8
|
+
# Redis connection settings, read from the environment with
# local-development defaults.
redis_host = os.getenv('REDIS_HOST', 'localhost')
# os.getenv returns a *string* when the variable is set (the int default is
# used only when it is unset); coerce so port/db are always ints.
redis_port = int(os.getenv('REDIS_PORT', 6379))
redis_db = int(os.getenv('REDIS_DB', 0))
redis_password = os.getenv('REDIS_PASSWORD', None)

# Lazily created module-wide Redis client (see get_redis_client).
_r = None

"""
Log the Redis connection details
"""
logging.info(f'[Redis] host: {redis_host}, port: {redis_port}, db: {redis_db}')
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_redis_client():
    """
    Return the module-wide Redis client, creating it lazily on first use.

    :return: the shared redis.Redis instance
    """
    global _r
    if _r is None:
        _r = redis.Redis(
            host=redis_host,
            port=redis_port,
            db=redis_db,
            password=redis_password,
        )
    return _r
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def get_stream_key(stream_name, group):
    """
    Build the Redis key under which a stream's position is stored.

    :param stream_name: event store stream name
    :param group: stream group
    :return: key of the form "<app>_<group>_<stream>_stream_position"
    """
    parts = (app_name, group, stream_name)
    return "%s_%s_%s_stream_position" % parts
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_stream_position(stream_name, group):
    """
    Look up the last recorded stream position in Redis.

    :param stream_name: event store stream name
    :param group: stream group
    :return: the stored position as an int, or 0 when nothing is recorded
    """
    raw = get_redis_client().get(get_stream_key(stream_name, group))
    if not raw:
        return 0
    return int(raw)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def set_stream_position(stream_name, group, position):
    """
    Record a stream's position in Redis.

    :param stream_name: event store stream name
    :param group: stream group
    :param position: position value to store
    :return: None
    """
    client = get_redis_client()
    client.set(get_stream_key(stream_name, group), position)
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import os.path
|
|
2
|
+
from pyspark.sql import SparkSession
|
|
3
|
+
|
|
4
|
+
from mx_stream_core.config.app import app_name
|
|
5
|
+
from mx_stream_core.config.s3 import s3_access_key, s3_secret_key, s3_endpoint, s3_enable, s3_bucket
|
|
6
|
+
from mx_stream_core.config.spark import master_url
|
|
7
|
+
|
|
8
|
+
# Build the Spark session with Delta Lake extensions enabled.
spark_builder = SparkSession.builder.appName(app_name).master(master_url) \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# s3_enable is the string 'true'/'false' taken from the environment.
if s3_enable == 'true':
    spark_builder.config("spark.hadoop.fs.s3a.access.key", s3_access_key) \
        .config("spark.hadoop.fs.s3a.secret.key", s3_secret_key) \
        .config("spark.hadoop.fs.s3a.endpoint", s3_endpoint)

spark = spark_builder.getOrCreate()
spark.sparkContext.setLogLevel(os.getenv('LOG_LEVEL', 'WARN'))

checkpoint_folder = os.getenv('CHECKPOINT_FOLDER', ".checkpoints")
# BUG FIX: s3_enable is a non-empty string ('true' or 'false'), so plain
# truthiness was always True and the '/tmp/.checkpoints' fallback was
# unreachable; compare against the literal 'true' like the builder config.
default_root_checkpoint_path = f'{s3_bucket}/{checkpoint_folder}' if s3_enable == 'true' else '/tmp/.checkpoints'
root_check_point_path = os.getenv('CHECKPOINT_PATH', default_root_checkpoint_path)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def get_checkpoint_path(table_name=None) -> str:
    """
    Build the checkpoint directory path for a streaming query.

    :param table_name: optional table name; when given, the checkpoint is
        namespaced per table
    :return: checkpoint path under the configured root
    """
    suffix = "checkpoint" if table_name is None else f"{table_name}_checkpoint"
    return f"{root_check_point_path}/{suffix}"
|
|
File without changes
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def extract_args():
    """
    Parse the command-line options shared by stream workers.

    :return: argparse.Namespace with optional 'worker' and 'layer' attributes
    """
    parser = argparse.ArgumentParser(description="Process some named parameters.")
    for flag, description in (('--worker', 'Worker to run'),
                              ('--layer', 'Layer to run')):
        parser.add_argument(flag, type=str, required=False, help=description)
    return parser.parse_args()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
OK
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: mx_stream_core
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: This is package stream core of mindx
|
|
5
|
+
Home-page: http://pypi.private.server/simple/your_package_name/
|
|
6
|
+
Author: MindX
|
|
7
|
+
Author-email: quyentm@mindx.com.vn
|
|
8
|
+
License: LICENSE.txt
|
|
9
|
+
License-File: LICENSE.txt
|
|
10
|
+
Requires-Dist: pyspark ==3.5.1
|
|
11
|
+
Requires-Dist: delta-spark ==3.1.0
|
|
12
|
+
Requires-Dist: redis ==5.0.5
|
|
13
|
+
Requires-Dist: esdbclient ==1.0.25
|
|
14
|
+
Requires-Dist: kafka-python ==2.0.2
|
|
15
|
+
|
|
16
|
+
## OK
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
mx_stream_core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
mx_stream_core/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
mx_stream_core/config/app.py,sha256=I5JT3NV_N2ISOv81tOWHyHtlZqLeOXcQxWuhU34MuXQ,105
|
|
4
|
+
mx_stream_core/config/delta.py,sha256=yjGBnoslG8IW1eljEYaIA0pBujovRzZKEf5tgmqD0w0,260
|
|
5
|
+
mx_stream_core/config/event_store.py,sha256=VySb1807-oLOpLLdrTulISUg5Ec0mD5H-Xp3sfWCYIk,119
|
|
6
|
+
mx_stream_core/config/kafka.py,sha256=PpmGYovIMpesZQnZMbhJZt04dHNaznce58GwHOoXlUM,75
|
|
7
|
+
mx_stream_core/config/layer_events.py,sha256=sk0l9eS3TbnmEGxgJKAom1Zw53JeDeEapkTTyZEZbZY,154
|
|
8
|
+
mx_stream_core/config/s3.py,sha256=Cb4Qi3oQdw1RzHFCILAsdLPe35oBSQ-dpXbSjwe2JtE,267
|
|
9
|
+
mx_stream_core/config/spark.py,sha256=gM2UnnWwsfKgjkBxaDERzY9y4dTnxRJzID4Ioh0tTrA,60
|
|
10
|
+
mx_stream_core/config/streams.py,sha256=_t0jKGUjuT55Pc2_iC_nGRc_NxyMI4kC4DO37USvbeA,30
|
|
11
|
+
mx_stream_core/infrastructure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
+
mx_stream_core/infrastructure/delta.py,sha256=Q_3c1zAvyfJwMz8jYIOU17NI477S8cyu7j5pEoH6nJA,226
|
|
13
|
+
mx_stream_core/infrastructure/event_store.py,sha256=VYXWwTkhVhAFfUhGJPsZRyURaCdGiXdivHJDiyTsKhY,1193
|
|
14
|
+
mx_stream_core/infrastructure/kafka.py,sha256=xPV1BQa62h9j2KHo7h86d7MuSh7vMZOdzwujhf4ae9g,4050
|
|
15
|
+
mx_stream_core/infrastructure/redis.py,sha256=uHiUbwJdW09Hdbm3wZLmpEuB1XFnRm67G6weeTVpBeI,1508
|
|
16
|
+
mx_stream_core/infrastructure/spark.py,sha256=XpBmllXpK2AEiDlng1qnZ-ZpYRAUIMgMJDJ6aH71w0g,1289
|
|
17
|
+
mx_stream_core/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
|
+
mx_stream_core/utils/cmd.py,sha256=KHdIjRpWlFO_m8p9zkawoBNOn8mIyZUL4U3fEsKsrCc,334
|
|
19
|
+
mx_stream_core-0.1.2.dist-info/LICENSE.txt,sha256=VlM5vE0z1ygXtYMCQRLrf1zfPl7vAlLW7BucmpThK7M,2
|
|
20
|
+
mx_stream_core-0.1.2.dist-info/METADATA,sha256=vCyNqeayroM14ypx1DbHvCD6K_vKQYDy9Sfu0NbQ7b8,453
|
|
21
|
+
mx_stream_core-0.1.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
22
|
+
mx_stream_core-0.1.2.dist-info/top_level.txt,sha256=7P7-xBnxxYlylDW1z6XXlCWBZFn_rKCU8ESabosijcw,15
|
|
23
|
+
mx_stream_core-0.1.2.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
mx_stream_core
|