nexaroa 0.0.111__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neuroshard/__init__.py +93 -0
- neuroshard/__main__.py +4 -0
- neuroshard/cli.py +466 -0
- neuroshard/core/__init__.py +92 -0
- neuroshard/core/consensus/verifier.py +252 -0
- neuroshard/core/crypto/__init__.py +20 -0
- neuroshard/core/crypto/ecdsa.py +392 -0
- neuroshard/core/economics/__init__.py +52 -0
- neuroshard/core/economics/constants.py +387 -0
- neuroshard/core/economics/ledger.py +2111 -0
- neuroshard/core/economics/market.py +975 -0
- neuroshard/core/economics/wallet.py +168 -0
- neuroshard/core/governance/__init__.py +74 -0
- neuroshard/core/governance/proposal.py +561 -0
- neuroshard/core/governance/registry.py +545 -0
- neuroshard/core/governance/versioning.py +332 -0
- neuroshard/core/governance/voting.py +453 -0
- neuroshard/core/model/__init__.py +30 -0
- neuroshard/core/model/dynamic.py +4186 -0
- neuroshard/core/model/llm.py +905 -0
- neuroshard/core/model/registry.py +164 -0
- neuroshard/core/model/scaler.py +387 -0
- neuroshard/core/model/tokenizer.py +568 -0
- neuroshard/core/network/__init__.py +56 -0
- neuroshard/core/network/connection_pool.py +72 -0
- neuroshard/core/network/dht.py +130 -0
- neuroshard/core/network/dht_plan.py +55 -0
- neuroshard/core/network/dht_proof_store.py +516 -0
- neuroshard/core/network/dht_protocol.py +261 -0
- neuroshard/core/network/dht_service.py +506 -0
- neuroshard/core/network/encrypted_channel.py +141 -0
- neuroshard/core/network/nat.py +201 -0
- neuroshard/core/network/nat_traversal.py +695 -0
- neuroshard/core/network/p2p.py +929 -0
- neuroshard/core/network/p2p_data.py +150 -0
- neuroshard/core/swarm/__init__.py +106 -0
- neuroshard/core/swarm/aggregation.py +729 -0
- neuroshard/core/swarm/buffers.py +643 -0
- neuroshard/core/swarm/checkpoint.py +709 -0
- neuroshard/core/swarm/compute.py +624 -0
- neuroshard/core/swarm/diloco.py +844 -0
- neuroshard/core/swarm/factory.py +1288 -0
- neuroshard/core/swarm/heartbeat.py +669 -0
- neuroshard/core/swarm/logger.py +487 -0
- neuroshard/core/swarm/router.py +658 -0
- neuroshard/core/swarm/service.py +640 -0
- neuroshard/core/training/__init__.py +29 -0
- neuroshard/core/training/checkpoint.py +600 -0
- neuroshard/core/training/distributed.py +1602 -0
- neuroshard/core/training/global_tracker.py +617 -0
- neuroshard/core/training/production.py +276 -0
- neuroshard/governance_cli.py +729 -0
- neuroshard/grpc_server.py +895 -0
- neuroshard/runner.py +3223 -0
- neuroshard/sdk/__init__.py +92 -0
- neuroshard/sdk/client.py +990 -0
- neuroshard/sdk/errors.py +101 -0
- neuroshard/sdk/types.py +282 -0
- neuroshard/tracker/__init__.py +0 -0
- neuroshard/tracker/server.py +864 -0
- neuroshard/ui/__init__.py +0 -0
- neuroshard/ui/app.py +102 -0
- neuroshard/ui/templates/index.html +1052 -0
- neuroshard/utils/__init__.py +0 -0
- neuroshard/utils/autostart.py +81 -0
- neuroshard/utils/hardware.py +121 -0
- neuroshard/utils/serialization.py +90 -0
- neuroshard/version.py +1 -0
- nexaroa-0.0.111.dist-info/METADATA +283 -0
- nexaroa-0.0.111.dist-info/RECORD +78 -0
- nexaroa-0.0.111.dist-info/WHEEL +5 -0
- nexaroa-0.0.111.dist-info/entry_points.txt +4 -0
- nexaroa-0.0.111.dist-info/licenses/LICENSE +190 -0
- nexaroa-0.0.111.dist-info/top_level.txt +2 -0
- protos/__init__.py +0 -0
- protos/neuroshard.proto +651 -0
- protos/neuroshard_pb2.py +160 -0
- protos/neuroshard_pb2_grpc.py +1298 -0
neuroshard/core/model/tokenizer.py
@@ -0,0 +1,568 @@
+"""
+NeuroLLM Tokenizer
+
+A BPE (Byte Pair Encoding) tokenizer for NeuroLLM that is trained from scratch
+by the network itself.
+There is no pre-trained tokenizer - it's a truly decentralized tokenizer that grows with the network.
+
+The tokenizer starts with a base vocabulary (bytes + special tokens) and learns
+new subword units as more training data is contributed by the network.
+
+Features:
+- Pure BPE implementation (no external dependencies for core functionality)
+- Starts with byte-level vocabulary (256 tokens)
+- Learns merges from contributed training data
+- Can be updated through network consensus
+- Fully serializable for checkpoint distribution
+"""
+
+import os
+import json
+import logging
+import re
+from typing import List, Dict, Optional, Tuple, Set
+from collections import Counter
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+class NeuroTokenizer:
+    """
+    A truly decentralized BPE tokenizer for NeuroLLM.
+
+    Unlike traditional tokenizers that are pre-trained on massive corpora,
+    this tokenizer starts with a minimal vocabulary and learns from the
+    training data contributed by network participants.
+    """
+
+    # Special tokens (reserved IDs 0-9)
+    PAD_TOKEN = "<|pad|>"
+    BOS_TOKEN = "<|bos|>"
+    EOS_TOKEN = "<|eos|>"
+    UNK_TOKEN = "<|unk|>"
+
+    PAD_ID = 0
+    BOS_ID = 1
+    EOS_ID = 2
+    UNK_ID = 3
+
+    # Byte tokens start at ID 10 (256 bytes = IDs 10-265)
+    BYTE_OFFSET = 10
+
+    # Learned merges start at ID 266
+    MERGE_OFFSET = 266
+
+    def __init__(self, vocab_size: int = 10_000_000):  # 10M - effectively unlimited
+        """
+        Initialize the NeuroLLM tokenizer.
+
+        Args:
+            vocab_size: Maximum vocabulary size. Default 10M is effectively unlimited.
+                The tokenizer can grow as large as needed - memory is the only
+                real constraint. For reference: GPT-4 ~100K, most LLMs ~32K-256K.
+        """
+        self.vocab_size = vocab_size
+
+        # Core vocabulary
+        self.special_tokens = {
+            self.PAD_TOKEN: self.PAD_ID,
+            self.BOS_TOKEN: self.BOS_ID,
+            self.EOS_TOKEN: self.EOS_ID,
+            self.UNK_TOKEN: self.UNK_ID,
+        }
+
+        # Byte vocabulary (256 bytes)
+        self.byte_to_id = {i: i + self.BYTE_OFFSET for i in range(256)}
+        self.id_to_byte = {v: k for k, v in self.byte_to_id.items()}
+
+        # Learned BPE merges: (token1, token2) -> merged_token_id
+        self.merges: Dict[Tuple[int, int], int] = {}
+        self.merge_to_tokens: Dict[int, Tuple[int, int]] = {}  # Reverse lookup
+
+        # Token to string (for decoding merged tokens)
+        self.id_to_string: Dict[int, str] = {}
+
+        # Next available ID for new merges
+        self.next_merge_id = self.MERGE_OFFSET
+
+        # Track which data sources have contributed merges
+        # Format: {"source_name": num_merges_contributed}
+        self.sources_contributed: Dict[str, int] = {}
+
+        # Statistics
+        self.total_tokens_processed = 0
+
+        logger.info(f"NeuroTokenizer initialized with vocab_size={vocab_size}")
+
+    @property
+    def pad_token_id(self) -> int:
+        return self.PAD_ID
+
+    @property
+    def bos_token_id(self) -> int:
+        return self.BOS_ID
+
+    @property
+    def eos_token_id(self) -> int:
+        return self.EOS_ID
+
+    @property
+    def unk_token_id(self) -> int:
+        return self.UNK_ID
+
+    @property
+    def current_vocab_size(self) -> int:
+        """
+        The current vocabulary size (valid token IDs: 0 to current_vocab_size-1).
+
+        This grows as the tokenizer learns BPE merges:
+        - Initial: 266 (10 special + 256 bytes)
+        - After learning: 266 + num_merges
+        - Maximum: vocab_size (10M default - effectively unlimited)
+
+        IMPORTANT: During inference, only tokens 0 to current_vocab_size-1 are valid.
+        Tokens beyond this have no learned representation and should not be sampled.
+        """
+        return self.next_merge_id
+
+    def _text_to_bytes(self, text: str) -> List[int]:
+        """Convert text to byte-level token IDs."""
+        return [self.byte_to_id[b] for b in text.encode('utf-8')]
+
+    def _apply_merges(self, token_ids: List[int]) -> List[int]:
+        """
+        Apply learned BPE merges to a sequence of token IDs.
+
+        OPTIMIZED: Uses heap-based approach for O(n log n) instead of O(n²).
+        Merges are applied in priority order (lower merge ID = higher priority).
+        """
+        if not self.merges or len(token_ids) <= 1:
+            return token_ids
+
+        import heapq
+
+        # Convert to list for in-place modification
+        tokens = list(token_ids)
+        n = len(tokens)
+
+        # Track which positions are "deleted" (merged into previous)
+        deleted = [False] * n
+
+        # Build initial heap of mergeable pairs: (merge_id, position)
+        # Lower merge_id = higher priority (learned earlier = more frequent)
+        heap = []
+        for i in range(n - 1):
+            pair = (tokens[i], tokens[i + 1])
+            if pair in self.merges:
+                heapq.heappush(heap, (self.merges[pair], i))
+
+        while heap:
+            merge_id, pos = heapq.heappop(heap)
+
+            # Skip if position was already processed
+            if pos >= n - 1 or deleted[pos]:
+                continue
+
+            # Find actual next non-deleted position
+            next_pos = pos + 1
+            while next_pos < n and deleted[next_pos]:
+                next_pos += 1
+
+            if next_pos >= n:
+                continue
+
+            # Check if this merge still applies
+            pair = (tokens[pos], tokens[next_pos])
+            if pair not in self.merges or self.merges[pair] != merge_id:
+                continue
+
+            # Apply merge: replace token at pos, mark next_pos as deleted
+            tokens[pos] = merge_id
+            deleted[next_pos] = True
+
+            # Find previous non-deleted position
+            prev_pos = pos - 1
+            while prev_pos >= 0 and deleted[prev_pos]:
+                prev_pos -= 1
+
+            # Find next-next non-deleted position
+            next_next_pos = next_pos + 1
+            while next_next_pos < n and deleted[next_next_pos]:
+                next_next_pos += 1
+
+            # Add new potential merges to heap
+            if prev_pos >= 0:
+                new_pair = (tokens[prev_pos], tokens[pos])
+                if new_pair in self.merges:
+                    heapq.heappush(heap, (self.merges[new_pair], prev_pos))
+
+            if next_next_pos < n:
+                new_pair = (tokens[pos], tokens[next_next_pos])
+                if new_pair in self.merges:
+                    heapq.heappush(heap, (self.merges[new_pair], pos))
+
+        # Build result excluding deleted positions
+        return [tokens[i] for i in range(n) if not deleted[i]]
+
+    def encode(
+        self,
+        text: str,
+        add_special_tokens: bool = True,
+        max_length: Optional[int] = None,
+        truncation: bool = False,
+        padding: bool = False,
+    ) -> List[int]:
+        """
+        Encode text to token IDs.
+
+        Args:
+            text: Input text
+            add_special_tokens: Add BOS/EOS tokens
+            max_length: Maximum length (truncate if longer)
+            truncation: Whether to truncate
+            padding: Whether to pad to max_length
+
+        Returns:
+            List of token IDs
+        """
+        # Convert to bytes
+        byte_ids = self._text_to_bytes(text)
+
+        # Apply BPE merges
+        token_ids = self._apply_merges(byte_ids)
+
+        # Add special tokens
+        if add_special_tokens:
+            token_ids = [self.BOS_ID] + token_ids + [self.EOS_ID]
+
+        # Truncation
+        if truncation and max_length and len(token_ids) > max_length:
+            token_ids = token_ids[:max_length]
+
+        # Padding
+        if padding and max_length and len(token_ids) < max_length:
+            token_ids = token_ids + [self.PAD_ID] * (max_length - len(token_ids))
+
+        self.total_tokens_processed += len(token_ids)
+        return token_ids
+
+    def _decode_token(self, token_id: int) -> bytes:
+        """Decode a single token ID to bytes."""
+        # Special tokens
+        if token_id in [self.PAD_ID, self.BOS_ID, self.EOS_ID, self.UNK_ID]:
+            return b''
+
+        # Byte token
+        if token_id in self.id_to_byte:
+            return bytes([self.id_to_byte[token_id]])
+
+        # Merged token - recursively decode
+        if token_id in self.merge_to_tokens:
+            t1, t2 = self.merge_to_tokens[token_id]
+            return self._decode_token(t1) + self._decode_token(t2)
+
+        # Unknown token - this should NOT happen in normal operation
+        # If we get here, the model output a token ID beyond current_vocab_size
+        # This is a bug in the generation code (should be masking invalid tokens)
+        logger.warning(f"Unknown token ID {token_id} (vocab_size={self.current_vocab_size}) - using UNK")
+        return b'<unk>'
+
+    def decode(
+        self,
+        token_ids: List[int],
+        skip_special_tokens: bool = True
+    ) -> str:
+        """
+        Decode token IDs to text.
+
+        Args:
+            token_ids: List of token IDs
+            skip_special_tokens: Skip special tokens in output
+
+        Returns:
+            Decoded text
+        """
+        byte_sequence = b''
+
+        for tid in token_ids:
+            if skip_special_tokens and tid in [self.PAD_ID, self.BOS_ID, self.EOS_ID, self.UNK_ID]:
+                continue
+            byte_sequence += self._decode_token(tid)
+
+        # Decode UTF-8, replacing errors
+        return byte_sequence.decode('utf-8', errors='replace')
+
+    def learn_merges(self, texts: List[str], num_merges: int = 1000, min_frequency: int = 2):
+        """
+        Learn new BPE merges from training data using an optimized algorithm.
+
+        This uses incremental pair counting with a heap for O(n log n) performance
+        instead of the naive O(n² × m) algorithm.
+
+        Args:
+            texts: List of training texts
+            num_merges: Number of new merges to learn
+            min_frequency: Minimum pair frequency to create merge
+        """
+        import heapq
+
+        if self.next_merge_id + num_merges > self.vocab_size:
+            num_merges = self.vocab_size - self.next_merge_id
+            if num_merges <= 0:
+                logger.warning("Vocabulary is full, cannot learn more merges")
+                return
+
+        logger.info(f"Tokenizing {len(texts)} texts...")
+
+        # Tokenize all texts to current vocabulary
+        # Use a word-based approach: split by whitespace first, then BPE within words
+        # This is much more efficient and produces better tokens
+        word_freq: Counter = Counter()
+        for text in texts:
+            # Split into words (preserve some punctuation patterns)
+            words = re.findall(r'\S+|\s+', text)
+            for word in words:
+                if word.strip():  # Skip pure whitespace
+                    word_freq[word] += 1
+
+        logger.info(f"Found {len(word_freq)} unique words")
+
+        # Convert words to byte sequences with frequency
+        # Format: {word_tuple: frequency} where word_tuple is tuple of token ids
+        word_tokens: Dict[tuple, int] = {}
+        for word, freq in word_freq.items():
+            byte_ids = tuple(self._text_to_bytes(word))
+            token_ids = tuple(self._apply_merges(list(byte_ids)))
+            if token_ids in word_tokens:
+                word_tokens[token_ids] += freq
+            else:
+                word_tokens[token_ids] = freq
+
+        logger.info(f"Converted to {len(word_tokens)} unique token sequences")
+
+        # Build initial pair counts
+        pair_counts: Counter = Counter()
+        # Track which words contain which pairs for efficient updates
+        pair_to_words: Dict[Tuple[int, int], Set[tuple]] = {}
+
+        for word, freq in word_tokens.items():
+            for i in range(len(word) - 1):
+                pair = (word[i], word[i + 1])
+                pair_counts[pair] += freq
+                if pair not in pair_to_words:
+                    pair_to_words[pair] = set()
+                pair_to_words[pair].add(word)
+
+        logger.info(f"Initial pair count: {len(pair_counts)} unique pairs")
+
+        merges_learned = 0
+        log_interval = max(1, num_merges // 20)  # Log ~20 times during learning
+
+        while merges_learned < num_merges:
+            if not pair_counts:
+                logger.info("No more pairs to merge")
+                break
+
+            # Find most frequent pair
+            best_pair, count = pair_counts.most_common(1)[0]
+
+            if count < min_frequency:
+                logger.info(f"Best pair frequency {count} below minimum {min_frequency}")
+                break
+
+            # Create new merge
+            new_id = self.next_merge_id
+            self.merges[best_pair] = new_id
+            self.merge_to_tokens[new_id] = best_pair
+            self.next_merge_id += 1
+            merges_learned += 1
+
+            if merges_learned % log_interval == 0:
+                logger.info(f" Learned {merges_learned}/{num_merges} merges, best pair freq={count}")
+
+            # Update word_tokens and pair_counts incrementally
+            words_to_update = pair_to_words.get(best_pair, set()).copy()
+
+            # Remove the merged pair from counts
+            del pair_counts[best_pair]
+            if best_pair in pair_to_words:
+                del pair_to_words[best_pair]
+
+            for old_word in words_to_update:
+                if old_word not in word_tokens:
+                    continue
+
+                freq = word_tokens[old_word]
+
+                # Remove old pair counts for this word
+                for i in range(len(old_word) - 1):
+                    pair = (old_word[i], old_word[i + 1])
+                    if pair in pair_counts:
+                        pair_counts[pair] -= freq
+                        if pair_counts[pair] <= 0:
+                            del pair_counts[pair]
+                    if pair in pair_to_words and old_word in pair_to_words[pair]:
+                        pair_to_words[pair].discard(old_word)
+
+                # Apply merge to create new word
+                new_word = []
+                i = 0
+                while i < len(old_word):
+                    if i < len(old_word) - 1 and (old_word[i], old_word[i + 1]) == best_pair:
+                        new_word.append(new_id)
+                        i += 2
+                    else:
+                        new_word.append(old_word[i])
+                        i += 1
+                new_word = tuple(new_word)
+
+                # Update word_tokens
+                del word_tokens[old_word]
+                if new_word in word_tokens:
+                    word_tokens[new_word] += freq
+                else:
+                    word_tokens[new_word] = freq
+
+                # Add new pair counts for this word
+                for i in range(len(new_word) - 1):
+                    pair = (new_word[i], new_word[i + 1])
+                    pair_counts[pair] += freq
+                    if pair not in pair_to_words:
+                        pair_to_words[pair] = set()
+                    pair_to_words[pair].add(new_word)
+
+        logger.info(f"Learned {merges_learned} new merges, vocab size now {len(self)}")
+
+    def batch_encode(
+        self,
+        texts: List[str],
+        max_length: Optional[int] = None,
+        padding: bool = True,
+        truncation: bool = True,
+    ) -> Dict[str, List[List[int]]]:
+        """
+        Encode a batch of texts.
+
+        Returns:
+            Dict with 'input_ids' and 'attention_mask'
+        """
+        input_ids = []
+        attention_mask = []
+
+        for text in texts:
+            ids = self.encode(text, max_length=max_length, truncation=truncation, padding=padding)
+            input_ids.append(ids)
+            attention_mask.append([1 if tid != self.PAD_ID else 0 for tid in ids])
+
+        return {"input_ids": input_ids, "attention_mask": attention_mask}
+
+    def save(self, path: str):
+        """
+        Save tokenizer to a JSON file.
+
+        Args:
+            path: Path to save the tokenizer JSON file (must end with .json)
+        """
+        if not path.endswith('.json'):
+            path = path + '.json'
+
+        config = {
+            "vocab_size": self.vocab_size,
+            "next_merge_id": self.next_merge_id,
+            "total_tokens_processed": self.total_tokens_processed,
+            "sources_contributed": self.sources_contributed,
+            # Convert tuple keys to strings for JSON
+            "merges": {f"{k[0]}_{k[1]}": v for k, v in self.merges.items()},
+        }
+
+        # Ensure parent directory exists
+        parent_dir = os.path.dirname(path)
+        if parent_dir:
+            os.makedirs(parent_dir, exist_ok=True)
+
+        with open(path, "w") as f:
+            json.dump(config, f, indent=2)
+
+        logger.info(f"Tokenizer saved to {path} ({self.current_vocab_size} tokens, {len(self.merges)} merges)")
+
+    @classmethod
+    def load(cls, path: str) -> 'NeuroTokenizer':
+        """
+        Load tokenizer from a JSON file.
+
+        Args:
+            path: Path to the tokenizer JSON file
+
+        Returns:
+            Loaded NeuroTokenizer instance
+        """
+        if not path.endswith('.json'):
+            path = path + '.json'
+
+        if not os.path.exists(path):
+            logger.warning(f"No tokenizer found at {path}, creating new one")
+            return cls()
+
+        with open(path) as f:
+            config = json.load(f)
+
+        tokenizer = cls(vocab_size=config.get("vocab_size", 10_000_000))  # Default to unlimited
+        tokenizer.next_merge_id = config.get("next_merge_id", cls.MERGE_OFFSET)
+        tokenizer.total_tokens_processed = config.get("total_tokens_processed", 0)
+        tokenizer.sources_contributed = config.get("sources_contributed", {})
+
+        # Restore merges
+        merges_data = config.get("merges", {})
+        for key_str, merged_id in merges_data.items():
+            t1, t2 = map(int, key_str.split("_"))
+            tokenizer.merges[(t1, t2)] = merged_id
+            tokenizer.merge_to_tokens[merged_id] = (t1, t2)
+
+        logger.info(f"Tokenizer loaded from {path} ({tokenizer.current_vocab_size} tokens, {len(tokenizer.merges)} merges)")
+        return tokenizer
+
+    def __len__(self) -> int:
+        """Return current vocabulary size (valid token count)."""
+        # This should match current_vocab_size for consistency
+        # Base: 266 tokens (IDs 0-265: 10 special/reserved + 256 bytes)
+        # Plus: learned merges
+        return self.current_vocab_size
+
+    def has_source_contributed(self, source_name: str) -> bool:
+        """Check if a data source has already contributed merges."""
+        return source_name in self.sources_contributed
+
+    def record_source_contribution(self, source_name: str, num_merges: int):
+        """Record that a source has contributed merges."""
+        self.sources_contributed[source_name] = num_merges
+        logger.info(f"Recorded contribution: '{source_name}' added {num_merges} merges")
+
+    def get_stats(self) -> Dict:
+        """Get tokenizer statistics."""
+        return {
+            "vocab_size": self.vocab_size,
+            "current_vocab": len(self),
+            "num_merges": len(self.merges),
+            "total_tokens_processed": self.total_tokens_processed,
+            "can_learn_more": self.next_merge_id < self.vocab_size,
+            "sources_contributed": self.sources_contributed,
+        }
+
+
+# Global tokenizer instance
+_tokenizer: Optional[NeuroTokenizer] = None
+
+
+def get_neuro_tokenizer() -> NeuroTokenizer:
+    """Get the global NeuroLLM tokenizer."""
+    global _tokenizer
+    if _tokenizer is None:
+        _tokenizer = NeuroTokenizer()
+    return _tokenizer
+
+
+def reset_tokenizer():
+    """Reset the global tokenizer (for testing)."""
+    global _tokenizer
+    _tokenizer = None
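The following usage sketch is not part of the package diff; it is a minimal illustration of the tokenizer API shown above, assuming the module is importable as neuroshard.core.model.tokenizer (the +568-line file in the listing) and that /tmp is writable:

from neuroshard.core.model.tokenizer import NeuroTokenizer

# A fresh tokenizer knows only the special tokens plus 256 byte tokens (266 IDs).
tok = NeuroTokenizer()
assert len(tok) == 266

# Learn a handful of BPE merges from locally contributed text.
corpus = ["the network trains the tokenizer", "the tokenizer grows with the network"]
tok.learn_merges(corpus, num_merges=50, min_frequency=2)

# Encode/decode round-trip; BOS/EOS are added by default and skipped on decode.
ids = tok.encode("the tokenizer grows")
assert tok.decode(ids) == "the tokenizer grows"

# Persist and restore as JSON; merges and counters survive the round-trip.
tok.save("/tmp/neuro_tokenizer.json")
restored = NeuroTokenizer.load("/tmp/neuro_tokenizer.json")
assert len(restored) == len(tok)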
neuroshard/core/network/__init__.py
@@ -0,0 +1,56 @@
+# neuroshard/core/network/__init__.py
+"""
+Network components for NeuroShard.
+
+- p2p: P2PManager
+- p2p_data: P2PDataManager
+- dht: DHT, Node, RoutingTable
+- dht_protocol: DHTProtocol
+- dht_service: DHTService, DHTServiceMixin
+- nat: NATTraverser
+- nat_traversal: NATTraversalManager, STUNClient
+- connection_pool: ConnectionPool, get_channel
+- encrypted_channel: EncryptedPrompt
+"""
+
+# Lazy imports to avoid circular dependencies
+__all__ = [
+    'P2PManager',
+    'P2PDataManager',
+    'DHT',
+    'DHTProtocol',
+    'DHTService',
+    'NATTraverser',
+    'NATTraversalManager',
+    'STUNClient',
+    'ConnectionPool',
+    'get_channel',
+]
+
+def __getattr__(name):
+    """Lazy loading of submodules."""
+    if name == 'P2PManager':
+        from neuroshard.core.network.p2p import P2PManager
+        return P2PManager
+    elif name == 'P2PDataManager':
+        from neuroshard.core.network.p2p_data import P2PDataManager
+        return P2PDataManager
+    elif name in ('DHT', 'Node', 'RoutingTable', 'ID_BITS'):
+        from neuroshard.core.network import dht
+        return getattr(dht, name)
+    elif name == 'DHTProtocol':
+        from neuroshard.core.network.dht_protocol import DHTProtocol
+        return DHTProtocol
+    elif name in ('DHTService', 'DHTServiceMixin'):
+        from neuroshard.core.network import dht_service
+        return getattr(dht_service, name)
+    elif name == 'NATTraverser':
+        from neuroshard.core.network.nat import NATTraverser
+        return NATTraverser
+    elif name in ('NATTraversalManager', 'STUNClient', 'NATType', 'PeerConnectivity'):
+        from neuroshard.core.network import nat_traversal
+        return getattr(nat_traversal, name)
+    elif name in ('ConnectionPool', 'get_channel'):
+        from neuroshard.core.network import connection_pool
+        return getattr(connection_pool, name)
+    raise AttributeError(f"module 'neuroshard.core.network' has no attribute '{name}'")
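Not part of the diff: a short sketch of how the module-level __getattr__ above behaves (PEP 562 lazy loading), so importing neuroshard.core.network stays cheap until a specific component is first accessed:

# Importing the package itself does not import any submodule yet.
import neuroshard.core.network as net

# First attribute access triggers __getattr__, which imports connection_pool on demand.
ConnectionPool = net.ConnectionPool
pool = ConnectionPool()

# Names not handled by __getattr__ fall through to the final raise and surface as AttributeError.
try:
    net.DoesNotExist
except AttributeError as exc:
    print(exc)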
neuroshard/core/network/connection_pool.py
@@ -0,0 +1,72 @@
+import grpc
+import time
+from typing import Dict
+import threading
+
+class ConnectionPool:
+    _instance = None
+    _lock = threading.Lock()
+
+    def __new__(cls):
+        if cls._instance is None:
+            with cls._lock:
+                if cls._instance is None:
+                    cls._instance = super(ConnectionPool, cls).__new__(cls)
+                    cls._instance.channels = {}  # url -> channel
+                    cls._instance.last_used = {}  # url -> timestamp
+        return cls._instance
+
+    def get_channel(self, address: str):
+        """
+        Get an existing channel or create a new one.
+        address format: "ip:port"
+        """
+        # Normalize address (remove http:// if present)
+        if address.startswith("http://"):
+            address = address.replace("http://", "")
+        if address.startswith("https://"):
+            address = address.replace("https://", "")
+
+        with self._lock:
+            if address in self.channels:
+                # Check if channel is active (simplified check)
+                self.last_used[address] = time.time()
+                return self.channels[address]
+
+            # Create new channel for P2P network
+            # Fast keepalive to detect dead nodes quickly in decentralized network
+            # IMPORTANT: Increase message size for activation tensors in pipeline training!
+            MAX_MESSAGE_SIZE = 64 * 1024 * 1024  # 64MB for large batches/sequences
+            options = [
+                ('grpc.keepalive_time_ms', 30000),  # Ping every 30 seconds
+                ('grpc.keepalive_timeout_ms', 10000),  # 10 second timeout
+                ('grpc.keepalive_permit_without_calls', True),  # Ping even when idle
+                ('grpc.http2.max_pings_without_data', 0),  # Unlimited pings
+                ('grpc.max_receive_message_length', MAX_MESSAGE_SIZE),  # For receiving responses
+                ('grpc.max_send_message_length', MAX_MESSAGE_SIZE),  # For sending activations
+            ]
+            channel = grpc.insecure_channel(address, options=options)
+            self.channels[address] = channel
+            self.last_used[address] = time.time()
+            return channel
+
+    def cleanup(self, max_idle_seconds=300):
+        """Close channels idle for too long"""
+        now = time.time()
+        to_remove = []
+        with self._lock:
+            for addr, last_time in self.last_used.items():
+                if now - last_time > max_idle_seconds:
+                    to_remove.append(addr)
+
+            for addr in to_remove:
+                print(f"Closing idle connection to {addr}")
+                self.channels[addr].close()
+                del self.channels[addr]
+                del self.last_used[addr]
+
+# Global accessor
+def get_channel(address: str):
+    pool = ConnectionPool()
+    return pool.get_channel(address)
+
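Not part of the diff: a hedged usage sketch for the connection pool above. The gRPC stub class named in the comment is a placeholder, not something defined in this hunk; the generated service stubs live in protos/neuroshard_pb2_grpc.py.

from neuroshard.core.network.connection_pool import ConnectionPool, get_channel

# The pool is a process-wide singleton: both calls normalize to the same
# "ip:port" key and therefore return the same insecure gRPC channel object.
ch1 = get_channel("http://10.0.0.5:50051")
ch2 = ConnectionPool().get_channel("10.0.0.5:50051")
assert ch1 is ch2

# A generated stub could then be bound to the pooled channel, e.g.:
# stub = neuroshard_pb2_grpc.NeuroShardServiceStub(ch1)  # placeholder stub name

# Periodically drop channels that have sat idle for more than five minutes.
ConnectionPool().cleanup(max_idle_seconds=300)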