mooncake-transfer-engine 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,20 @@
1
+ Metadata-Version: 2.4
2
+ Name: mooncake-transfer-engine
3
+ Version: 0.1.0
4
+ Summary: Python binding of a Mooncake library using pybind11
5
+ Home-page: https://github.com/kvcache-ai/Mooncake
6
+ Author: Mooncake Authors
7
+ Project-URL: Documentation, https://github.com/kvcache-ai/Mooncake/tree/main/doc
8
+ Project-URL: Source, https://github.com/kvcache-ai/Mooncake
9
+ Project-URL: Issues, https://github.com/kvcache-ai/Mooncake/issues
10
+ Keywords: mooncake,data transfer,kv cache,llm inference
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: C++
13
+ Classifier: Operating System :: POSIX :: Linux
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Dynamic: author
16
+ Dynamic: classifier
17
+ Dynamic: home-page
18
+ Dynamic: keywords
19
+ Dynamic: project-url
20
+ Dynamic: summary
@@ -0,0 +1,3 @@
1
+ from .mooncake_vllm_adaptor import MooncakeDistributedStore
2
+ from .mooncake_vllm_adaptor import mooncake_vllm_adaptor
3
+
@@ -0,0 +1,20 @@
1
+ Metadata-Version: 2.4
2
+ Name: mooncake-transfer-engine
3
+ Version: 0.1.0
4
+ Summary: Python binding of a Mooncake library using pybind11
5
+ Home-page: https://github.com/kvcache-ai/Mooncake
6
+ Author: Mooncake Authors
7
+ Project-URL: Documentation, https://github.com/kvcache-ai/Mooncake/tree/main/doc
8
+ Project-URL: Source, https://github.com/kvcache-ai/Mooncake
9
+ Project-URL: Issues, https://github.com/kvcache-ai/Mooncake/issues
10
+ Keywords: mooncake,data transfer,kv cache,llm inference
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: C++
13
+ Classifier: Operating System :: POSIX :: Linux
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Dynamic: author
16
+ Dynamic: classifier
17
+ Dynamic: home-page
18
+ Dynamic: keywords
19
+ Dynamic: project-url
20
+ Dynamic: summary
@@ -0,0 +1,13 @@
1
+ pyproject.toml
2
+ setup.py
3
+ mooncake/__init__.py
4
+ mooncake/mooncake_master
5
+ mooncake/mooncake_sglang_adaptor.cpython-310-x86_64-linux-gnu.so
6
+ mooncake/mooncake_vllm_adaptor.cpython-310-x86_64-linux-gnu.so
7
+ mooncake/lib_so/libetcd-cpp-api.so
8
+ mooncake_transfer_engine.egg-info/PKG-INFO
9
+ mooncake_transfer_engine.egg-info/SOURCES.txt
10
+ mooncake_transfer_engine.egg-info/dependency_links.txt
11
+ mooncake_transfer_engine.egg-info/not-zip-safe
12
+ mooncake_transfer_engine.egg-info/top_level.txt
13
+ tests/test_distributed_object_store.py
@@ -0,0 +1,3 @@
1
+ [build-system]
2
+ requires = ["setuptools", "wheel"]
3
+ build-backend = "setuptools.build_meta"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,52 @@
1
+ import sys
2
+ import os
3
+ from setuptools import setup, find_packages
4
+ from setuptools.dist import Distribution
5
+ from wheel.bdist_wheel import bdist_wheel
6
+
7
class BinaryDistribution(Distribution):
    """Distribution that always reports itself as containing extension modules."""

    def has_ext_modules(self):
        """Unconditionally report that extension modules are present."""
        return True
10
+
11
class CustomBdistWheel(bdist_wheel):
    """``bdist_wheel`` command that forces a non-pure, manylinux2014 x86_64 wheel."""

    def finalize_options(self):
        """Run the stock option finalization, then override the platform tagging.

        The overrides are applied after the base implementation so that they
        are not recomputed by it.
        """
        super().finalize_options()
        # Mark the wheel as platform-specific rather than pure Python.
        self.root_is_pure = False
        # Pretend the platform name was given explicitly and pin it.
        self.plat_name_supplied = True
        self.plat_name = "manylinux2014_x86_64"
17
+
18
# Minimum supported interpreter, derived from the interpreter running the
# build; published as the distribution's Requires-Python metadata below.
python_version = f">={sys.version_info.major}.{sys.version_info.minor}"

# Release version; the VERSION environment variable overrides the default.
VERSION = os.environ.get("VERSION", "0.1.0")

setup(
    name="mooncake-transfer-engine",
    version=VERSION,
    packages=find_packages(),
    # Ship the prebuilt binaries that live inside the package directory.
    package_data={"mooncake": [
        "*.so",
        "mooncake_master",
        "lib_so/libetcd-cpp-api.so",
    ]},
    include_package_data=True,
    zip_safe=False,
    distclass=BinaryDistribution,
    cmdclass={
        'bdist_wheel': CustomBdistWheel,
    },
    # Previously computed but never passed on, so installers had no
    # interpreter constraint to check against.
    python_requires=python_version,
    author="Mooncake Authors",
    description="Python binding of a Mooncake library using pybind11",
    url="https://github.com/kvcache-ai/Mooncake",
    project_urls={
        "Documentation": "https://github.com/kvcache-ai/Mooncake/tree/main/doc",
        "Source": "https://github.com/kvcache-ai/Mooncake",
        "Issues": "https://github.com/kvcache-ai/Mooncake/issues",
    },
    keywords=["mooncake", "data transfer", "kv cache", "llm inference"],
    classifiers=[
        "Programming Language :: Python :: 3",
        "Programming Language :: C++",
        "Operating System :: POSIX :: Linux",
        "License :: OSI Approved :: Apache Software License",
    ],
)
@@ -0,0 +1,271 @@
1
+ import unittest
2
+ import os
3
+ import time
4
+ import threading
5
+ import random
6
+ from mooncake import MooncakeDistributedStore
7
+
8
+
9
def get_client(store, *, global_segment_size=3200 * 1024 * 1024,
               local_buffer_size=512 * 1024 * 1024):
    """Initialize and setup the distributed store client.

    Connection settings are read from environment variables, each with a
    default: PROTOCOL ("tcp"), DEVICE_NAME ("ibp6s0"), LOCAL_HOSTNAME
    ("localhost"), MC_METADATA_SERVER ("127.0.0.1:2379") and MASTER_SERVER
    ("127.0.0.1:50051").

    Args:
        store: Object exposing ``setup(...)``; it is configured in place.
        global_segment_size: Global segment size in bytes forwarded to
            ``store.setup`` (default 3200 MB).
        local_buffer_size: Local buffer size in bytes forwarded to
            ``store.setup`` (default 512 MB).

    Raises:
        RuntimeError: If ``store.setup`` returns a non-zero code.
    """
    protocol = os.getenv("PROTOCOL", "tcp")
    device_name = os.getenv("DEVICE_NAME", "ibp6s0")
    local_hostname = os.getenv("LOCAL_HOSTNAME", "localhost")
    metadata_server = os.getenv("MC_METADATA_SERVER", "127.0.0.1:2379")
    master_server_address = os.getenv("MASTER_SERVER", "127.0.0.1:50051")

    retcode = store.setup(
        local_hostname,
        metadata_server,
        global_segment_size,
        local_buffer_size,
        protocol,
        device_name,
        master_server_address,
    )

    # Any truthy return code is treated as a setup failure.
    if retcode:
        raise RuntimeError(f"Failed to setup store client. Return code: {retcode}")
31
+
32
+
33
class TestDistributedObjectStore(unittest.TestCase):
    """Tests for the MooncakeDistributedStore Python interface.

    One store client is created in ``setUpClass`` and shared by every test
    through ``self.store``.
    """

    @classmethod
    def setUpClass(cls):
        """Initialize the store once for all tests."""
        cls.store = MooncakeDistributedStore()
        get_client(cls.store)

    def test_client_tear_down(self):
        """Test client tear down and re-initialization."""
        test_data = b"Hello, World!"
        key = "test_teardown_key"

        # Put data and verify teardown clears it
        self.assertEqual(self.store.put(key, test_data), 0)
        self.assertEqual(self.store.close(), 0)
        time.sleep(1)  # Allow time for teardown to complete

        # Re-initialize the store
        get_client(self.store)

        # Verify data is gone after teardown
        retrieved_data = self.store.get(key)
        self.assertEqual(retrieved_data, b"")

        # Verify store is functional after re-initialization
        self.assertEqual(self.store.put(key, test_data), 0)
        retrieved_data = self.store.get(key)
        self.assertEqual(retrieved_data, test_data)

    def test_basic_put_get_exist_operations(self):
        """Test basic Put/Get/Exist operations through the Python interface."""
        test_data = b"Hello, World!"
        key = "test_basic_key"

        # Test Put operation
        self.assertEqual(self.store.put(key, test_data), 0)

        # Verify data through Get operation
        self.assertEqual(self.store.getSize(key), len(test_data))
        retrieved_data = self.store.get(key)
        self.assertEqual(retrieved_data, test_data)

        # Put again with the same key, should succeed
        self.assertEqual(self.store.put(key, test_data), 0)

        # Remove the key
        self.assertEqual(self.store.remove(key), 0)

        # Get after remove should return empty bytes
        self.assertLess(self.store.getSize(key), 0)
        empty_data = self.store.get(key)
        self.assertEqual(empty_data, b"")

        # Test isExist functionality
        test_data_2 = b"Testing exists!"
        key_2 = "test_exist_key"

        # Should not exist initially
        self.assertLess(self.store.getSize(key_2), 0)
        self.assertEqual(self.store.isExist(key_2), 0)

        # Should exist after put
        self.assertEqual(self.store.put(key_2, test_data_2), 0)
        self.assertEqual(self.store.isExist(key_2), 1)
        self.assertEqual(self.store.getSize(key_2), len(test_data_2))

        # Should not exist after remove
        self.assertEqual(self.store.remove(key_2), 0)
        self.assertLess(self.store.getSize(key_2), 0)
        self.assertEqual(self.store.isExist(key_2), 0)

    def test_concurrent_stress_with_barrier(self):
        """Test concurrent Put/Get operations with multiple threads using barrier.

        Worker threads and the main thread meet at three barriers (start,
        after puts, after gets) so the main thread can time each phase. If a
        worker fails, it aborts every barrier so that no participant is left
        waiting forever; the failure is then reported after all threads join.
        """
        NUM_THREADS = 8
        VALUE_SIZE = 1024 * 1024  # 1MB
        OPERATIONS_PER_THREAD = 100

        # Create barriers for synchronization (+1 party for the main thread)
        start_barrier = threading.Barrier(NUM_THREADS + 1)
        put_barrier = threading.Barrier(NUM_THREADS + 1)  # Barrier after put operations
        get_barrier = threading.Barrier(NUM_THREADS + 1)  # Barrier after get operations
        barriers = (start_barrier, put_barrier, get_barrier)

        # Statistics for system-wide timing
        system_stats = {
            'put_start': 0,
            'put_end': 0,
            'get_start': 0,
            'get_end': 0
        }
        thread_exceptions = []

        def worker(thread_id):
            try:
                # Generate test data (1MB)
                test_data = os.urandom(VALUE_SIZE)
                thread_keys = [f"key_{thread_id}_{i}" for i in range(OPERATIONS_PER_THREAD)]

                # Wait for all threads to be ready
                start_barrier.wait()

                # Put operations
                for key in thread_keys:
                    result = self.store.put(key, test_data)
                    self.assertEqual(result, 0, f"Put operation failed for key {key}")

                # Wait for all threads to complete put operations
                put_barrier.wait()

                # Get operations
                for key in thread_keys:
                    retrieved_data = self.store.get(key)
                    self.assertEqual(len(retrieved_data), VALUE_SIZE,
                                     f"Retrieved data size mismatch for key {key}")
                    self.assertEqual(retrieved_data, test_data,
                                     f"Retrieved data content mismatch for key {key}")

                # Wait for all threads to complete get operations
                get_barrier.wait()

                # Remove all keys
                for key in thread_keys:
                    self.assertEqual(self.store.remove(key), 0)

            except threading.BrokenBarrierError:
                # Another participant failed and aborted the barriers; its
                # error has already been recorded, so just stop quietly.
                pass
            except Exception as e:
                # Record first, then abort, so the cause is always visible to
                # whoever is released by the abort.
                thread_exceptions.append(f"Thread {thread_id} failed: {str(e)}")
                for barrier in barriers:
                    barrier.abort()

        # Create and start threads
        threads = []
        for i in range(NUM_THREADS):
            t = threading.Thread(target=worker, args=(i,), name=f"Worker-{i}")
            threads.append(t)
            t.start()

        try:
            # Wait for all threads to be ready and start the test
            start_barrier.wait()

            # Record put start time
            system_stats['put_start'] = time.perf_counter()

            # Wait for all put operations to complete
            put_barrier.wait()
            system_stats['put_end'] = time.perf_counter()

            # Record get start time
            system_stats['get_start'] = time.perf_counter()

            # Wait for all get operations to complete
            get_barrier.wait()
            system_stats['get_end'] = time.perf_counter()
        except threading.BrokenBarrierError:
            # A worker failed; the details are in thread_exceptions and are
            # asserted on below.
            pass
        finally:
            # Join all threads
            for t in threads:
                t.join()

        # Check for any exceptions
        self.assertEqual(len(thread_exceptions), 0, "\n".join(thread_exceptions))

        # Calculate system-wide statistics
        total_operations = NUM_THREADS * OPERATIONS_PER_THREAD
        put_duration = system_stats['put_end'] - system_stats['put_start']
        get_duration = system_stats['get_end'] - system_stats['get_start']
        total_data_size_gb = (VALUE_SIZE * total_operations) / (1024**3)

        print("\nConcurrent Stress Test Results:")
        print(f"Total threads: {NUM_THREADS}")
        print(f"Operations per thread: {OPERATIONS_PER_THREAD}")
        print(f"Total operations: {total_operations}")
        print(f"Data block size: {VALUE_SIZE/1024/1024:.2f}MB")
        print(f"Total data processed: {total_data_size_gb:.2f}GB")
        print(f"Put duration: {put_duration:.2f} seconds")
        print(f"Get duration: {get_duration:.2f} seconds")
        print(f"System Put throughput: {total_operations/put_duration:.2f} ops/sec")
        print(f"System Get throughput: {total_operations/get_duration:.2f} ops/sec")
        print(f"System Put bandwidth: {total_data_size_gb/put_duration:.2f} GB/sec")
        print(f"System Get bandwidth: {total_data_size_gb/get_duration:.2f} GB/sec")

    def test_dict_fuzz_e2e(self):
        """End-to-end fuzz test comparing distributed store behavior with dict.

        Performs 1000 random operations (put, get, remove) with random value
        sizes between 1 byte and 64MB. Keys still tracked in the reference
        dict are removed afterwards, whether or not the test succeeded.
        """
        # Local reference dict to simulate expected dict behavior
        reference = {}
        operations = 1000
        # Use a pool of keys to limit memory consumption
        keys_pool = [f"key_{i}" for i in range(100)]
        # Track which keys have values assigned to ensure consistency
        key_values = {}
        # Fuzz record for debugging in case of errors
        fuzz_record = []
        try:
            for i in range(operations):
                op = random.choice(["put", "get", "remove"])
                key = random.choice(keys_pool)
                if op == "put":
                    # If key already exists, use the same value to ensure consistency
                    if key in key_values:
                        value = key_values[key]
                        size = len(value)
                    else:
                        size = random.randint(1, 64 * 1024 * 1024)
                        value = os.urandom(size)
                        key_values[key] = value

                    fuzz_record.append(f"{i}: put {key} [size: {size}]")
                    error_code = self.store.put(key, value)
                    if error_code == -200:
                        # The space is not enough, continue to next operation
                        continue
                    elif error_code == 0:
                        reference[key] = value
                    else:
                        raise RuntimeError(f"Put operation failed for key {key}. Error code: {error_code}")
                elif op == "get":
                    fuzz_record.append(f"{i}: get {key}")
                    retrieved = self.store.get(key)
                    expected = reference.get(key, b"")
                    self.assertEqual(retrieved, expected)
                elif op == "remove":
                    fuzz_record.append(f"{i}: remove {key}")
                    self.store.remove(key)
                    reference.pop(key, None)
                    # Also remove from key_values to allow new value if key is reused
                    key_values.pop(key, None)
        except Exception as e:
            print(f"Error: {e}")
            print('\nFuzz record (operations so far):')
            for record in fuzz_record:
                print(record)
            # Bare raise keeps the original traceback intact
            raise
        finally:
            # Cleanup: ensure all remaining keys are removed, even on failure,
            # so leftover values do not occupy the shared store.
            for key in list(reference):
                self.store.remove(key)
269
+
270
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()