tensorwatch-api 0.1 (tensorwatch_api-0.1-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tensorwatch_api-0.1.dist-info/METADATA +8 -0
- tensorwatch_api-0.1.dist-info/RECORD +10 -0
- tensorwatch_api-0.1.dist-info/WHEEL +5 -0
- tensorwatch_api-0.1.dist-info/entry_points.txt +2 -0
- tensorwatch_api-0.1.dist-info/top_level.txt +1 -0
- twapi/Example_Senter.py +51 -0
- twapi/__init__.py +1 -0
- twapi/kafka_connector.py +255 -0
- twapi/pykafka_connector.py +293 -0
- twapi/twapi.py +197 -0
tensorwatch_api-0.1.dist-info/RECORD
ADDED
@@ -0,0 +1,10 @@
+twapi/Example_Senter.py,sha256=jPKi7f9teidzXKq-yXMW0sa2yzYGCob3WIHVi3J1uNA,1532
+twapi/__init__.py,sha256=wYkftb8fUjtf62Qs3yXegUNkMn4bSTaSWLyRCvv4yt0,34
+twapi/kafka_connector.py,sha256=RUcK9MYnfaMDO47QQC9-_SyqahRZmmoBtBwCSetAVFs,10829
+twapi/pykafka_connector.py,sha256=7e5fHEZyqJo1a43mHTpqmscbQJZb7CjZhpN8kyRw-ok,12977
+twapi/twapi.py,sha256=NkfcTz6u-YuJyCc9NsjwYuH6A2Pzkhn7lQVxZf0aXJQ,9158
+tensorwatch_api-0.1.dist-info/METADATA,sha256=vdsLmWDb-RRbGvXv7fdc1ajqJoUQwFTydttNhlMqdg4,189
+tensorwatch_api-0.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+tensorwatch_api-0.1.dist-info/entry_points.txt,sha256=xIxCPp_fRVck1fVJE1kqhxAEVEycrxOk8TM48EXeHWs,59
+tensorwatch_api-0.1.dist-info/top_level.txt,sha256=E48dj13nSJGzHKjYnAoCGtiUjM9AU2LZk3vvmdWtNhM,6
+tensorwatch_api-0.1.dist-info/RECORD,,
tensorwatch_api-0.1.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+twapi
twapi/Example_Senter.py
ADDED
@@ -0,0 +1,51 @@
+from pykafka import KafkaClient
+import json
+import time
+import random
+import sys
+
+def main():
+    """
+    Benchmark producer to test throughput and latency.
+    """
+    try:
+        client = KafkaClient(hosts="127.0.0.1:9093")
+        topic = client.topics['gemini2']
+    except Exception as e:
+        print(f"Failed to connect to Kafka: {e}")
+        sys.exit(1)
+
+    parse_type = "json"
+    num_messages = 200000  # Number of messages to send
+
+    with topic.get_sync_producer() as producer:
+        print("Starting benchmark...")
+        print(f"Sending {num_messages} messages to topic '{topic.name.decode()}'...")
+
+        start_time = time.time()
+
+        for i in range(num_messages):
+            message = {
+                'seq': i,
+                'send_time': time.time(),
+                'data': random.randint(0, 1000)
+            }
+
+            if parse_type == "json":
+                data = json.dumps(message)
+                producer.produce(data.encode('utf-8'))
+
+            if (i + 1) % 1000 == 0:
+                print(f"Sent {i + 1}/{num_messages} messages...")
+
+    end_time = time.time()
+    duration = end_time - start_time
+    throughput = num_messages / duration if duration > 0 else float('inf')
+
+    print("\n--- BENCHMARK SUMMARY ---")
+    print(f"Sent {num_messages} messages in {duration:.2f} seconds.")
+    print(f"Producer throughput: {throughput:.2f} messages/sec.")
+    print("------------------------")
+
+if __name__ == "__main__":
+    main()
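For reference only (not part of the package): the consumer in twapi/kafka_connector.py below is built on confluent-kafka rather than pykafka, so an equivalent benchmark producer can be written against that client as well. This is a minimal sketch; the broker address and topic name are assumed to match the script above.

import json, time, random
from confluent_kafka import Producer

producer = Producer({"bootstrap.servers": "127.0.0.1:9093"})
for i in range(1000):
    # Same payload shape as Example_Senter.py: 'send_time' drives latency reporting downstream.
    msg = {"seq": i, "send_time": time.time(), "data": random.randint(0, 1000)}
    producer.produce("gemini2", json.dumps(msg).encode("utf-8"))
    producer.poll(0)   # serve delivery callbacks
producer.flush()       # block until all queued messages are delivered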
twapi/__init__.py
ADDED
@@ -0,0 +1 @@
+from .twapi import twapi
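Assuming the re-export above, both import paths resolve to the same class once the wheel is installed; a quick illustration:

# Both paths refer to the same class object.
from twapi.twapi import twapi as twapi_direct   # direct module path
from twapi import twapi                         # via the re-export in twapi/__init__.py
assert twapi is twapi_direct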
twapi/kafka_connector.py
ADDED
@@ -0,0 +1,255 @@
+import time
+import threading
+from queue import Queue
+import random
+import json
+import pickle
+from typing import Dict
+from confluent_kafka import Consumer
+from tensorwatch import Watcher
+from probables import CountMinSketch
+import logging
+
+# Optional Parsers
+try:
+    import xmltodict
+except ImportError:
+    xmltodict = None
+
+try:
+    import avro.schema
+    from avro.io import DatumReader, BinaryDecoder
+    import io
+except ImportError:
+    avro = None
+
+try:
+    from protobuf_to_dict import protobuf_to_dict
+    from google.protobuf import message
+except ImportError:
+    protobuf_to_dict = None
+
+class KafkaConnector(threading.Thread):
+    """
+    A Kafka consumer that runs in a separate thread to consume messages from a Kafka topic.
+    It supports various message formats and integrates with TensorWatch for real-time data visualization.
+    """
+    def __init__(self, hosts="localhost:9092", topic=None, parsetype=None, avro_schema=None, queue_length=50000,
+                 cluster_size=1, consumer_config=None, poll=1.0, auto_offset="earliest", group_id="mygroup",
+                 decode="utf-8", schema_path=None, protobuf_message=None, random_sampling=None, countmin_width=None,
+                 countmin_depth=None, twapi_instance=None):
+        """
+        Initializes the KafkaConnector.
+
+        Args:
+            hosts (str): Comma-separated list of Kafka brokers.
+            topic (str): The Kafka topic to consume from.
+            parsetype (str): The format of the messages (e.g., "json", "pickle", "xml", "avro", "protobuf").
+            avro_schema (str): The Avro schema for message deserialization.
+            queue_length (int): The maximum number of messages to store in the internal queue.
+            cluster_size (int): The number of consumer threads to run.
+            consumer_config (dict): A dictionary of Kafka consumer configuration settings.
+            poll (float): The timeout for polling for new messages from Kafka.
+            auto_offset (str): The offset reset policy.
+            group_id (str): The consumer group ID.
+            decode (str): The encoding to use for decoding messages.
+            schema_path (str): The path to the Avro schema file.
+            protobuf_message (str): The name of the Protobuf message class.
+            random_sampling (int): The percentage of messages to sample (0-100).
+            countmin_width (int): The width of the Count-Min Sketch.
+            countmin_depth (int): The depth of the Count-Min Sketch.
+            twapi_instance: An instance of the TensorWatch API for updating metrics.
+        """
+        super().__init__()
+        self.hosts = hosts or "localhost:9092"
+        self.topic = topic
+        self.cluster_size = cluster_size
+        self.decode = decode
+        self.parsetype = parsetype
+        self.protobuf_message = protobuf_message
+        self.queue_length = queue_length
+        self.data = Queue(maxsize=queue_length)
+        self.cms = {}  # Count-Min Sketch table
+        self.countmin_width = countmin_width
+        self.countmin_depth = countmin_depth
+        self.random_sampling = random_sampling
+        self.poll = poll
+        self.consumer_config = consumer_config or {
+            "bootstrap.servers": self.hosts,
+            "group.id": group_id,
+            "auto.offset.reset": auto_offset,
+        }
+        self._quit = threading.Event()
+        self.size = 0
+        self.watcher = Watcher()
+        self.schema = None
+        self.reader = None
+
+        self.twapi_instance = twapi_instance
+        self.latencies = []
+        self.received_count = 0
+        self.last_report_time = time.time()
+        self.first_message_sent = False
+
+        # Load Avro Schema if needed
+        if parsetype == "avro" and avro:
+            try:
+                self.schema = avro.schema.parse(avro_schema)
+                self.reader = DatumReader(self.schema)
+            except Exception as e:
+                logging.error(f"Avro Schema Error: {e}, Avro may not work")
+                print(f"Avro Schema Error: {e}, Avro may not work")
+                return
+
+        # Load Protobuf if needed
+        if parsetype == "protobuf" and protobuf_to_dict:
+            try:
+                import importlib
+                module = importlib.import_module(protobuf_message)
+                self.protobuf_class = getattr(module, protobuf_message)
+
+            except Exception as e:
+                logging.error(f"Protobuf Import Error: {e}")
+                print(f"Protobuf Import Error: {e}")
+                self.protobuf_class = None
+
+        self.start()
+
+    def myparser(self, message):
+        """
+        Parses a message based on the specified format.
+
+        Args:
+            message: The message to parse.
+
+        Returns:
+            The parsed message, or None if parsing fails.
+        """
+        try:
+            if self.parsetype is None or self.parsetype.lower() == "json":
+                return json.loads(message)
+            elif self.parsetype.lower() == "pickle":
+                return pickle.loads(message)
+            elif self.parsetype.lower() == "xml" and xmltodict:
+                return xmltodict.parse(message)["root"]
+            elif self.parsetype.lower() == "protobuf" and protobuf_to_dict:
+                if self.protobuf_class:
+                    dynamic_message = self.protobuf_class()
+                    dynamic_message.ParseFromString(message)
+                    return protobuf_to_dict(dynamic_message)
+            elif self.parsetype.lower() == "avro" and avro:
+                decoder = BinaryDecoder(io.BytesIO(message))
+                return self.reader.read(decoder)
+        except Exception as e:
+            logging.error(f"Parsing Error ({self.parsetype}): {e}")
+            print(f"Parsing Error ({self.parsetype}): {e}")
+            return None
+
+    def process_message(self, msg):
+        """
+        Processes a single message from Kafka. This includes parsing, calculating latency,
+        and adding the message to the data queue.
+        """
+        receive_time = time.time()
+        try:
+            # Apply random sampling if configured
+            if self.random_sampling and self.random_sampling > random.randint(0, 100):
+                return
+
+            message = msg.value().decode(self.decode)
+            parsed_message = self.myparser(message)
+
+            # Calculate and record latency if send_time is in the message
+            if parsed_message and isinstance(parsed_message, dict) and 'send_time' in parsed_message:
+                self.received_count += 1
+                send_time = parsed_message['send_time']
+                latency = receive_time - send_time
+                self.latencies.append(latency)
+                parsed_message['latency'] = latency
+                parsed_message['receive_time'] = receive_time
+
+            # Add the parsed message to the queue if it's not full
+            if parsed_message and not self.data.full():
+                self.data.put(parsed_message, block=False)
+                # Notify the twapi_instance on the first message
+                if not self.first_message_sent and self.twapi_instance:
+                    logging.info("First message received, enabling apply button.")
+                    self.twapi_instance.enable_apply_button()
+                    self.first_message_sent = True
+            elif self.data.full():
+                logging.warning("Queue is full, dropping message.")
+
+            # Update Count-Min Sketch if configured
+            if isinstance(parsed_message, dict) and self.countmin_width and self.countmin_depth:
+                for key, value in parsed_message.items():
+                    self.cms.setdefault(key, CountMinSketch(width=self.countmin_width, depth=self.countmin_depth))
+                    self.cms[key].add(str(value))
+
+            self.size += 1
+        except Exception as e:
+            logging.error(f"Message Processing Error: {e}, Message: {message}")
+            print(f"Message Processing Error: {e}, Message: {message}")
+
+    def consumer_loop(self):
+        """
+        The main loop for the Kafka consumer. It polls for messages, processes them,
+        and handles errors.
+        """
+        logging.info(f"Starting consumer loop for topic '{self.topic}'")
+        consumer = Consumer(self.consumer_config)
+        consumer.subscribe([self.topic])
+
+        while not self._quit.is_set():
+            msg = consumer.poll(self.poll)
+            if msg and not msg.error():
+                self.process_message(msg)
+            elif msg and msg.error():
+                logging.error(f"Kafka Error: {msg.error()}")
+                print(f"Kafka Error: {msg.error()}")
+
+        consumer.close()
+        logging.info("Consumer loop stopped")
+
+    def run(self):
+        """
+        Starts the consumer threads and the main watcher loop.
+        """
+        logging.info(f"Starting {self.cluster_size} consumer threads")
+        threads = [threading.Thread(target=self.consumer_loop, daemon=True) for _ in range(self.cluster_size)]
+        for thread in threads:
+            thread.start()
+
+        while not self._quit.is_set():
+            # Observe the data queue with TensorWatch
+            if not self.data.empty():
+                self.watcher.observe(data=list(self.data.queue), size=self.size, cms=self.cms)
+
+            # --- BENCHMARK REPORTING ---
+            current_time = time.time()
+            if current_time - self.last_report_time > 5.0:  # Report every 5 seconds
+                if self.latencies:
+                    avg_latency = sum(self.latencies) / len(self.latencies)
+                    max_latency = max(self.latencies)
+                    min_latency = min(self.latencies)
+
+                    time_since_last_report = current_time - self.last_report_time
+                    throughput = self.received_count / time_since_last_report if time_since_last_report > 0 else 0
+
+                    stats_str = (f"Recv Throughput: {throughput:.2f} msgs/s | "
+                                 f"Send-Recv Latency (ms): "
+                                 f"Avg: {avg_latency*1000:.2f}, "
+                                 f"Min: {min_latency*1000:.2f}, "
+                                 f"Max: {max_latency*1000:.2f}")
+                    logging.info(f"Benchmark stats: {stats_str}")
+                    print(stats_str)
+
+                    # Update the TensorWatch API with the latest metrics
+                    if self.twapi_instance:
+                        self.twapi_instance.update_metrics(stats_str)
+
+                # Reset stats for the next interval
+                self.latencies = []
+                self.received_count = 0
+                self.last_report_time = current_time
+
+            time.sleep(0.4)
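An aside on the Count-Min Sketch table kept by the connector: each message field gets its own pyprobables sketch, keyed by field name, and stringified values are added to it. A hedged sketch of querying it from a notebook session follows; the broker, topic, and field name 'data' are illustrative, taken from the benchmark producer above.

import time
from twapi.kafka_connector import KafkaConnector

# Constructing the connector also starts its consumer threads (self.start() in __init__).
conn = KafkaConnector(topic="gemini2", hosts="127.0.0.1:9093", parsetype="json",
                      countmin_width=1000, countmin_depth=5)
time.sleep(10)  # let some messages accumulate

# Approximate count of how often the value 42 has appeared in the 'data' field so far.
if "data" in conn.cms:
    print(conn.cms["data"].check("42"))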
twapi/pykafka_connector.py
ADDED
@@ -0,0 +1,293 @@
+from pykafka import KafkaClient
+from pykafka.common import OffsetType
+from tensorwatch import Watcher
+import threading
+from queue import Queue
+import json
+import pickle
+import time
+import random
+import logging
+import io
+
+# Optional Parsers
+try:
+    import xmltodict
+except ImportError:
+    xmltodict = None
+
+try:
+    import avro.schema
+    from avro.io import DatumReader, BinaryDecoder
+except ImportError:
+    avro = None
+
+try:
+    from protobuf_to_dict import protobuf_to_dict
+except ImportError:
+    protobuf_to_dict = None
+
+from probables import CountMinSketch
+
+class pykafka_connector(threading.Thread):
+    """
+    A Kafka consumer that uses the pykafka library to consume messages from a Kafka topic.
+    It runs in a separate thread and supports various message formats.
+    """
+    def __init__(self, hosts: str = None, topic: str = None, parsetype: str = None, queue_length: int = None, cluster_size: int = 1,
+                 consumer_group: bytes = b'default', auto_offset_reset: OffsetType = OffsetType.EARLIEST,
+                 fetch_message_max_bytes: int = 1024 * 1024, num_consumer_fetchers: int = 1,
+                 auto_commit_enable: bool = False, auto_commit_interval_ms: int = 1000,
+                 queued_max_messages: int = 2000, fetch_min_bytes: int = 1,
+                 consumer_timeout_ms: int = -1, decode: str = "utf-8",
+                 scema_path: str = None, random_sampling: int = None, countmin_width: int = None,
+                 countmin_depth: int = None, twapi_instance=None, parser_extra=None, probuf_message=None, zookeeper_hosts: str = '127.0.0.1:2181'):
+        """
+        Initializes the pykafka_connector.
+
+        Args:
+            hosts (str): Comma-separated list of Kafka brokers.
+            topic (str): The Kafka topic to consume from.
+            parsetype (str): The format of the messages (e.g., "json", "pickle", "xml", "avro", "protobuf").
+            queue_length (int): The maximum number of messages to store in the internal queue.
+            cluster_size (int): The number of consumer threads to run.
+            consumer_group (bytes): The consumer group ID.
+            auto_offset_reset (OffsetType): The offset reset policy.
+            fetch_message_max_bytes (int): The maximum size of a message to fetch.
+            num_consumer_fetchers (int): The number of fetcher threads.
+            auto_commit_enable (bool): Whether to enable auto-commit.
+            auto_commit_interval_ms (int): The auto-commit interval in milliseconds.
+            queued_max_messages (int): The maximum number of messages to queue.
+            fetch_min_bytes (int): The minimum number of bytes to fetch.
+            consumer_timeout_ms (int): The consumer timeout in milliseconds.
+            decode (str): The encoding to use for decoding messages.
+            scema_path (str): The path to the Avro or Protobuf schema.
+            random_sampling (int): The percentage of messages to sample (0-100).
+            countmin_width (int): The width of the Count-Min Sketch.
+            countmin_depth (int): The depth of the Count-Min Sketch.
+            twapi_instance: An instance of the TensorWatch API for updating metrics.
+            parser_extra (str): Extra information for the parser (e.g., Avro schema, Protobuf module).
+            probuf_message (str): The name of the Protobuf message class.
+            zookeeper_hosts (str): Comma-separated list of Zookeeper hosts.
+        """
+        super().__init__()
+        self.hosts = hosts or "127.0.0.1:9092"
+        self.topic = topic
+        self.cluster_size = cluster_size
+        self.decode = decode
+        self.parsetype = parsetype
+        self.scema_path = scema_path
+        self.random_sampling = random_sampling
+        self.parser_extra = parser_extra
+        self.probuf_message = probuf_message
+        self.queue_length = queue_length
+        self.data = Queue(maxsize=queue_length or 50000)
+        self._quit = threading.Event()
+        self.size = 0
+        self.watcher = Watcher()
+        self.cms = {}
+        self.countmin_depth = countmin_depth
+        self.countmin_width = countmin_width
+
+        # pykafka specific settings
+        self.consumer_group = consumer_group
+        self.auto_offset_reset = auto_offset_reset
+        self.fetch_message_max_bytes = fetch_message_max_bytes
+        self.num_consumer_fetchers = num_consumer_fetchers
+        self.auto_commit_enable = auto_commit_enable
+        self.auto_commit_interval_ms = auto_commit_interval_ms
+        self.queued_max_messages = queued_max_messages
+        self.fetch_min_bytes = fetch_min_bytes
+        self.consumer_timeout_ms = consumer_timeout_ms
+        self.zookeeper_hosts = zookeeper_hosts
+
+        # twapi integration
+        self.twapi_instance = twapi_instance
+        self.latencies = []
+        self.received_count = 0
+        self.last_report_time = time.time()
+        self.first_message_sent = False
+
+        # Parsers initialization
+        self.reader = None
+        self.mymodule = None
+        if self.parsetype:
+            if self.parsetype.lower() == 'avro' and avro:
+                try:
+                    schema = avro.schema.parse(parser_extra)
+                    self.reader = DatumReader(schema)
+                except Exception as e:
+                    print(f"Avro schema error or avro not installed: {e}")
+            elif self.parsetype.lower() == 'protobuf' and protobuf_to_dict:
+                try:
+                    import sys
+                    import importlib
+                    if scema_path:
+                        sys.path.append(scema_path)
+                    mymodule = importlib.import_module(parser_extra)
+                    method_to_call = getattr(mymodule, probuf_message)
+                    self.mymodule = method_to_call
+                except Exception as e:
+                    print(f"Error importing protobuf: {e}")
+
+        self.start()
+
+    def myparser(self, message):
+        """
+        Parses a message based on the specified format.
+
+        Args:
+            message: The message to parse.
+
+        Returns:
+            The parsed message, or None if parsing fails.
+        """
+        try:
+            if self.parsetype is None or self.parsetype.lower() == 'json':
+                return json.loads(message)
+            elif self.parsetype.lower() == 'pickle':
+                return pickle.loads(message)
+            elif self.parsetype.lower() == 'xml' and xmltodict:
+                return xmltodict.parse(message).get("root")
+            elif self.parsetype.lower() == 'protobuf' and self.mymodule:
+                dynamic_message = self.mymodule()
+                dynamic_message.ParseFromString(message)
+                return protobuf_to_dict(dynamic_message)
+            elif self.parsetype.lower() == 'avro' and self.reader:
+                message_bytes = io.BytesIO(message)
+                decoder = BinaryDecoder(message_bytes)
+                return self.reader.read(decoder)
+        except Exception as e:
+            logging.error(f"Parsing Error ({self.parsetype}): {e}")
+            return None
+
+    def process_message(self, message_bytes):
+        """
+        Processes a single message from Kafka. This includes parsing, calculating latency,
+        and adding the message to the data queue.
+        """
+        receive_time = time.time()
+        try:
+            # Apply random sampling if configured
+            if self.random_sampling and self.random_sampling > random.randint(0, 100):
+                return
+
+            parsed_message = self.myparser(message_bytes)
+            if parsed_message is None:
+                return
+
+            # Calculate and record latency if send_time is in the message
+            if isinstance(parsed_message, dict) and 'send_time' in parsed_message:
+                self.received_count += 1
+                send_time = parsed_message['send_time']
+                latency = receive_time - send_time
+                self.latencies.append(latency)
+                parsed_message['latency'] = latency
+                parsed_message['receive_time'] = receive_time
+
+            # Add the parsed message to the queue if it's not full
+            if not self.data.full():
+                self.data.put(parsed_message, block=False)
+                # Notify the twapi_instance on the first message
+                if not self.first_message_sent and self.twapi_instance:
+                    logging.info("First message received, enabling apply button.")
+                    self.twapi_instance.enable_apply_button()
+                    self.first_message_sent = True
+            else:
+                logging.warning("Queue is full, dropping message.")
+
+            # Update Count-Min Sketch if configured
+            if isinstance(parsed_message, dict) and self.countmin_width and self.countmin_depth:
+                for key, value in parsed_message.items():
+                    self.cms.setdefault(key, CountMinSketch(width=self.countmin_width, depth=self.countmin_depth))
+                    self.cms[key].add(str(value))
+
+            self.size += 1
+        except Exception as e:
+            logging.error(f"Message Processing Error: {e}")
+
+    def consumer_loop(self):
+        """
+        The main loop for the Kafka consumer. It creates a Kafka client and consumes messages
+        from the specified topic.
+        """
+        logging.info(f"Starting pykafka consumer loop for topic '{self.topic}'")
+        client = KafkaClient(hosts=self.hosts)
+        topic = client.topics[self.topic]
+
+        # Use a balanced consumer if cluster_size is greater than 1
+        if self.cluster_size > 1:
+            consumer = topic.get_balanced_consumer(
+                consumer_group=self.consumer_group,
+                auto_commit_enable=self.auto_commit_enable,
+                auto_offset_reset=self.auto_offset_reset,
+                num_consumer_fetchers=self.num_consumer_fetchers,
+                auto_commit_interval_ms=self.auto_commit_interval_ms,
+                queued_max_messages=self.queued_max_messages,
+                fetch_min_bytes=self.fetch_min_bytes,
+                zookeeper_connect=self.zookeeper_hosts
+            )
+        else:
+            consumer = topic.get_simple_consumer(
+                auto_offset_reset=self.auto_offset_reset,
+                consumer_timeout_ms=self.consumer_timeout_ms,
+                fetch_message_max_bytes=self.fetch_message_max_bytes,
+                auto_commit_enable=self.auto_commit_enable,
+                auto_commit_interval_ms=self.auto_commit_interval_ms,
+                queued_max_messages=self.queued_max_messages,
+                fetch_min_bytes=self.fetch_min_bytes
+            )
+
+        for message in consumer:
+            if self._quit.is_set():
+                break
+            if message is not None:
+                self.process_message(message.value)
+
+        consumer.stop()
+        logging.info("Consumer loop stopped")
+
+    def run(self):
+        """
+        Starts the consumer threads and the main watcher loop.
+        """
+        logging.info(f"Starting {self.cluster_size} pykafka consumer threads")
+        threads = [threading.Thread(target=self.consumer_loop, daemon=True) for _ in range(self.cluster_size)]
+        for thread in threads:
+            thread.start()
+
+        while not self._quit.is_set():
+            # Observe the data queue with TensorWatch
+            if not self.data.empty():
+                self.watcher.observe(data=list(self.data.queue), size=self.size, cms=self.cms)
+
+            # --- BENCHMARK REPORTING ---
+            current_time = time.time()
+            if current_time - self.last_report_time > 5.0:  # Report every 5 seconds
+                if self.latencies:
+                    avg_latency = sum(self.latencies) / len(self.latencies)
+                    max_latency = max(self.latencies)
+                    min_latency = min(self.latencies)
+                    time_since_last_report = current_time - self.last_report_time
+                    throughput = self.received_count / time_since_last_report if time_since_last_report > 0 else 0
+
+                    stats_str = (f"Recv Throughput: {throughput:.2f} msgs/s | "
+                                 f"Send-Recv Latency (ms): "
+                                 f"Avg: {avg_latency*1000:.2f}, "
+                                 f"Min: {min_latency*1000:.2f}, "
+                                 f"Max: {max_latency*1000:.2f}")
+
+                    # Update the TensorWatch API with the latest metrics
+                    if self.twapi_instance:
+                        self.twapi_instance.update_metrics(stats_str)
+
+                # Reset stats for the next interval
+                self.latencies = []
+                self.received_count = 0
+                self.last_report_time = current_time
+
+            time.sleep(0.4)
+
+    def quit(self):
+        """Stops the consumer thread."""
+        self._quit.set()
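Unlike KafkaConnector above, pykafka_connector exposes a quit() method. A minimal shutdown sketch follows (broker, topic, and timing are illustrative, not from the package); note that the consumer iterator only re-checks the stop flag between messages, so a finite consumer_timeout_ms helps the loop wind down when the topic goes idle.

import time
from twapi.pykafka_connector import pykafka_connector

conn = pykafka_connector(topic="gemini2", hosts="127.0.0.1:9092",
                         parsetype="json", consumer_timeout_ms=1000)
time.sleep(30)   # consume for a while
conn.quit()      # sets the internal threading.Event
conn.join()      # wait for run() to return; consumer threads are daemons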
twapi/twapi.py
ADDED
@@ -0,0 +1,197 @@
+import tensorwatch as tw
+from . import kafka_connector as kc
+from . import pykafka_connector as pyc
+from IPython.display import display
+from ipywidgets import widgets
+import asyncio
+import time
+import logging
+import matplotlib.pyplot as plt
+
+class twapi:
+    """TensorWatch API Wrapper for Kafka Streaming and Visualization"""
+
+    def __init__(self):
+        """Initializes the twapi class, setting up the UI widgets and event handlers."""
+        self.default_value = 10
+        self.visualizer = None  # Initialize visualizer as None
+        self.client = tw.WatcherClient()
+        self.out = widgets.Output(layout={})
+
+        # Initialize UI widgets
+        self.update_interval = 0.5  # Delay in seconds
+        self.my_slider = widgets.IntSlider(value=self.default_value, min=1, max=100, step=1, description="Window Size:")
+        self.my_slider2 = widgets.IntSlider(value=self.default_value, min=1, max=100, step=1, description="Window Width:")
+        self.datebutton = widgets.Checkbox(value=False, description="Date")
+        self.offsetbutton = widgets.Checkbox(value=False, description="Use Offset")
+        self.dimhistorybutton = widgets.Checkbox(value=True, description="Dim History")
+        self.colorpicker = widgets.ColorPicker(value="blue", description="Pick a Color")
+
+        self.button_reset = widgets.Button(description="Reset", tooltip="Reset stream settings")
+        self.button_apply = widgets.Button(description="Please wait", tooltip="Apply changes to the visualization", disabled=True)
+
+        # Group widgets for a cleaner UI
+        left_box = widgets.VBox([self.my_slider, self.my_slider2, self.colorpicker])
+        right_box = widgets.VBox([self.offsetbutton, self.dimhistorybutton, self.datebutton])
+        self.options_box = widgets.HBox([left_box, right_box])
+        self.accordion = widgets.Accordion(children=[self.options_box])
+        self.accordion.set_title(0, 'Visualization Options')
+
+        # Event handlers
+        self._last_update = time.time()
+        self.button_reset.on_click(self.reset)
+        self.button_apply.on_click(self.apply_with_debounce)
+        self.metrics_label = widgets.Label(value="")
+
+        # Observe widget changes directly
+        self.my_slider.observe(self.apply_with_debounce, names='value')
+        self.my_slider2.observe(self.apply_with_debounce, names='value')
+        self.colorpicker.observe(self.apply_with_debounce, names='value')
+
+    def stream(self, expr):
+        """Creates a TensorWatch stream from an expression."""
+        self.expr = expr
+        try:
+            self.streamdata = self.client.create_stream(expr=expr)
+            logging.debug("Stream created successfully")
+        except Exception as e:
+            logging.error(f"Error creating stream: {e}")
+            print(f"Error creating stream: {e}")
+        return self
+
+    def apply_with_debounce(self, _=None):
+        """Debounced apply function to prevent too frequent updates."""
+        now = time.time()
+        if now - self._last_update > self.update_interval:
+            self.update_visualizer()
+            self._last_update = now
+            if self.button_apply.description == "Start":
+                self.button_apply.description = "Apply Changes"
+
+    def update_visualizer(self, _=None):
+        """Updates the TensorWatch visualizer with the latest widget values."""
+        if not hasattr(self, 'streamdata') or not self.streamdata:
+            self.out.clear_output(wait=True)
+            with self.out:
+                print("Stream data not available or empty yet. Please wait for data.")
+            return
+
+        try:
+            # Always clear output before drawing
+            self.out.clear_output(wait=True)
+
+            # Close previous visualizer if it exists to free resources
+            if self.visualizer:
+                self.visualizer.close()
+                plt.close('all')  # Also close any lingering matplotlib figures
+
+            # Create a new visualizer with the current settings
+            self.visualizer = tw.Visualizer(
+                self.streamdata,
+                vis_type="line",
+                window_width=self.my_slider2.value,
+                window_size=self.my_slider.value,
+                Date=self.datebutton.value,
+                useOffset=self.offsetbutton.value,
+                dim_history=self.dimhistorybutton.value,
+                color=self.colorpicker.value,
+            )
+            with self.out:
+                self.visualizer.show()
+
+        except Exception as e:
+            self.out.clear_output(wait=True)
+            with self.out:
+                print(f"Error updating visualizer: {e}")
+
+    def enable_apply_button(self):
+        """Enables the apply button and changes its description to 'Start'."""
+        logging.debug("Enabling apply button.")
+        self.button_apply.disabled = False
+        self.button_apply.description = "Start"
+
+    def reset(self, _=None):
+        """Resets all widget values to their defaults and clears the visualization."""
+        self.my_slider.value = self.default_value
+        self.my_slider2.value = self.default_value
+        self.datebutton.value = False
+        self.offsetbutton.value = False
+        self.dimhistorybutton.value = True
+        self.colorpicker.value = "blue"
+
+        # Clear the output and close the visualizer
+        self.out.clear_output()
+        if self.visualizer:
+            self.visualizer.close()
+            plt.close('all')
+            self.visualizer = None
+
+    def draw(self):
+        """Displays the UI for controlling the visualization."""
+        ui = widgets.VBox([
+            widgets.HBox([self.button_reset, self.button_apply]),
+            self.accordion,
+            self.out
+        ])
+        display(ui)
+
+    def draw_with_metrics(self):
+        """Displays the UI for controlling the visualization with a metrics label."""
+        ui = widgets.VBox([
+            self.metrics_label,
+            widgets.HBox([self.button_reset, self.button_apply]),
+            self.accordion,
+            self.out
+        ])
+        display(ui)
+
+    def update_metrics(self, metrics):
+        """Updates the metrics label with the provided text."""
+        self.metrics_label.value = metrics
+
+    def connector(self, topic, host, parsetype="json", cluster_size=1, conn_type="kafka", queue_length=50000,
+                  group_id="mygroup", avro_schema=None, schema_path=None, protobuf_message=None, parser_extra=None,
+                  random_sampling=None, countmin_width=None, countmin_depth=None):
+        """
+        Creates and returns a Kafka or PyKafka connector.
+
+        Args:
+            topic (str): The Kafka topic to consume from.
+            host (str): The Kafka broker host.
+            parsetype (str): The message format (e.g., 'json', 'pickle', 'avro').
+            cluster_size (int): The number of consumer threads.
+            conn_type (str): The type of connector to use ('kafka' or 'pykafka').
+            queue_length (int): The maximum size of the message queue.
+            group_id (str): The Kafka consumer group ID.
+            avro_schema (str): The Avro schema for the 'kafka' connector.
+            schema_path (str): The path to the schema file.
+            protobuf_message (str): The name of the Protobuf message class.
+            parser_extra (str): Extra data for the parser (e.g., Avro schema for 'pykafka').
+            random_sampling (int): The percentage of messages to sample.
+            countmin_width (int): The width of the Count-Min Sketch.
+            countmin_depth (int): The depth of the Count-Min Sketch.
+
+        Returns:
+            A KafkaConnector or pykafka_connector instance.
+        """
+        if conn_type == "kafka":
+            return kc.KafkaConnector(
+                topic=topic, hosts=host, parsetype=parsetype, cluster_size=cluster_size,
+                twapi_instance=self, queue_length=queue_length, group_id=group_id,
+                avro_schema=avro_schema, schema_path=schema_path, protobuf_message=protobuf_message,
+                random_sampling=random_sampling, countmin_width=countmin_width,
+                countmin_depth=countmin_depth)
+        elif conn_type == "pykafka":
+            return pyc.pykafka_connector(
+                topic=topic, hosts=host, parsetype=parsetype, cluster_size=cluster_size, twapi_instance=self,
+                queue_length=queue_length, consumer_group=bytes(group_id, 'utf-8'),
+                parser_extra=parser_extra, scema_path=schema_path, probuf_message=protobuf_message,
+                random_sampling=random_sampling, countmin_width=countmin_width,
+                countmin_depth=countmin_depth)
+        else:
+            raise ValueError("Invalid connector type. Choose 'kafka' or 'pykafka'.")
+
+    async def some_async_function(self):
+        """Example of an async function that can be called."""
+        await asyncio.sleep(1)
+        print("Async function completed")
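Taken together, a typical notebook session would wire these pieces up roughly as follows. This is a hedged usage sketch, not code from the package: the broker address, topic, and the stream expression (a TensorWatch lazy-logging lambda over what the connector's Watcher observes) are assumptions for illustration.

# Assumed Jupyter environment, with a broker at 127.0.0.1:9093 and the 'gemini2'
# topic fed by twapi/Example_Senter.py.
from twapi import twapi

api = twapi()                                   # builds the widget UI and a tw.WatcherClient
conn = api.connector(topic="gemini2",
                     host="127.0.0.1:9093",
                     parsetype="json",
                     conn_type="kafka")         # KafkaConnector starts consuming immediately

# The expression is evaluated against what the connector observes
# (data=..., size=..., cms=...); this particular projection is illustrative.
api.stream("lambda d: d.size")
api.draw_with_metrics()                         # controls, plot area, and the benchmark label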