nexaroa-0.0.111-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. neuroshard/__init__.py +93 -0
  2. neuroshard/__main__.py +4 -0
  3. neuroshard/cli.py +466 -0
  4. neuroshard/core/__init__.py +92 -0
  5. neuroshard/core/consensus/verifier.py +252 -0
  6. neuroshard/core/crypto/__init__.py +20 -0
  7. neuroshard/core/crypto/ecdsa.py +392 -0
  8. neuroshard/core/economics/__init__.py +52 -0
  9. neuroshard/core/economics/constants.py +387 -0
  10. neuroshard/core/economics/ledger.py +2111 -0
  11. neuroshard/core/economics/market.py +975 -0
  12. neuroshard/core/economics/wallet.py +168 -0
  13. neuroshard/core/governance/__init__.py +74 -0
  14. neuroshard/core/governance/proposal.py +561 -0
  15. neuroshard/core/governance/registry.py +545 -0
  16. neuroshard/core/governance/versioning.py +332 -0
  17. neuroshard/core/governance/voting.py +453 -0
  18. neuroshard/core/model/__init__.py +30 -0
  19. neuroshard/core/model/dynamic.py +4186 -0
  20. neuroshard/core/model/llm.py +905 -0
  21. neuroshard/core/model/registry.py +164 -0
  22. neuroshard/core/model/scaler.py +387 -0
  23. neuroshard/core/model/tokenizer.py +568 -0
  24. neuroshard/core/network/__init__.py +56 -0
  25. neuroshard/core/network/connection_pool.py +72 -0
  26. neuroshard/core/network/dht.py +130 -0
  27. neuroshard/core/network/dht_plan.py +55 -0
  28. neuroshard/core/network/dht_proof_store.py +516 -0
  29. neuroshard/core/network/dht_protocol.py +261 -0
  30. neuroshard/core/network/dht_service.py +506 -0
  31. neuroshard/core/network/encrypted_channel.py +141 -0
  32. neuroshard/core/network/nat.py +201 -0
  33. neuroshard/core/network/nat_traversal.py +695 -0
  34. neuroshard/core/network/p2p.py +929 -0
  35. neuroshard/core/network/p2p_data.py +150 -0
  36. neuroshard/core/swarm/__init__.py +106 -0
  37. neuroshard/core/swarm/aggregation.py +729 -0
  38. neuroshard/core/swarm/buffers.py +643 -0
  39. neuroshard/core/swarm/checkpoint.py +709 -0
  40. neuroshard/core/swarm/compute.py +624 -0
  41. neuroshard/core/swarm/diloco.py +844 -0
  42. neuroshard/core/swarm/factory.py +1288 -0
  43. neuroshard/core/swarm/heartbeat.py +669 -0
  44. neuroshard/core/swarm/logger.py +487 -0
  45. neuroshard/core/swarm/router.py +658 -0
  46. neuroshard/core/swarm/service.py +640 -0
  47. neuroshard/core/training/__init__.py +29 -0
  48. neuroshard/core/training/checkpoint.py +600 -0
  49. neuroshard/core/training/distributed.py +1602 -0
  50. neuroshard/core/training/global_tracker.py +617 -0
  51. neuroshard/core/training/production.py +276 -0
  52. neuroshard/governance_cli.py +729 -0
  53. neuroshard/grpc_server.py +895 -0
  54. neuroshard/runner.py +3223 -0
  55. neuroshard/sdk/__init__.py +92 -0
  56. neuroshard/sdk/client.py +990 -0
  57. neuroshard/sdk/errors.py +101 -0
  58. neuroshard/sdk/types.py +282 -0
  59. neuroshard/tracker/__init__.py +0 -0
  60. neuroshard/tracker/server.py +864 -0
  61. neuroshard/ui/__init__.py +0 -0
  62. neuroshard/ui/app.py +102 -0
  63. neuroshard/ui/templates/index.html +1052 -0
  64. neuroshard/utils/__init__.py +0 -0
  65. neuroshard/utils/autostart.py +81 -0
  66. neuroshard/utils/hardware.py +121 -0
  67. neuroshard/utils/serialization.py +90 -0
  68. neuroshard/version.py +1 -0
  69. nexaroa-0.0.111.dist-info/METADATA +283 -0
  70. nexaroa-0.0.111.dist-info/RECORD +78 -0
  71. nexaroa-0.0.111.dist-info/WHEEL +5 -0
  72. nexaroa-0.0.111.dist-info/entry_points.txt +4 -0
  73. nexaroa-0.0.111.dist-info/licenses/LICENSE +190 -0
  74. nexaroa-0.0.111.dist-info/top_level.txt +2 -0
  75. protos/__init__.py +0 -0
  76. protos/neuroshard.proto +651 -0
  77. protos/neuroshard_pb2.py +160 -0
  78. protos/neuroshard_pb2_grpc.py +1298 -0
@@ -0,0 +1,568 @@
+ """
+ NeuroLLM Tokenizer
+
+ A BPE (Byte Pair Encoding) tokenizer for NeuroLLM that is trained from scratch
+ by the network itself. This is not a pre-trained
+ tokenizer - it's a truly decentralized tokenizer that grows with the network.
+
+ The tokenizer starts with a base vocabulary (bytes + special tokens) and learns
+ new subword units as more training data is contributed by the network.
+
+ Features:
+ - Pure BPE implementation (no external dependencies for core functionality)
+ - Starts with byte-level vocabulary (256 tokens)
+ - Learns merges from contributed training data
+ - Can be updated through network consensus
+ - Fully serializable for checkpoint distribution
+ """
+
+ import os
+ import json
+ import logging
+ import re
+ from typing import List, Dict, Optional, Tuple, Set
+ from collections import Counter
+ from pathlib import Path
+
+ logger = logging.getLogger(__name__)
+
+
+ class NeuroTokenizer:
+     """
+     A truly decentralized BPE tokenizer for NeuroLLM.
+
+     Unlike traditional tokenizers that are pre-trained on massive corpora,
+     this tokenizer starts with a minimal vocabulary and learns from the
+     training data contributed by network participants.
+     """
+
+     # Special tokens (reserved IDs 0-9)
+     PAD_TOKEN = "<|pad|>"
+     BOS_TOKEN = "<|bos|>"
+     EOS_TOKEN = "<|eos|>"
+     UNK_TOKEN = "<|unk|>"
+
+     PAD_ID = 0
+     BOS_ID = 1
+     EOS_ID = 2
+     UNK_ID = 3
+
+     # Byte tokens start at ID 10 (256 bytes = IDs 10-265)
+     BYTE_OFFSET = 10
+
+     # Learned merges start at ID 266
+     MERGE_OFFSET = 266
+
+     def __init__(self, vocab_size: int = 10_000_000): # 10M - effectively unlimited
+         """
+         Initialize the NeuroLLM tokenizer.
+
+         Args:
+             vocab_size: Maximum vocabulary size. Default 10M is effectively unlimited.
+                 The tokenizer can grow as large as needed - memory is the only
+                 real constraint. For reference: GPT-4 ~100K, most LLMs ~32K-256K.
+         """
+         self.vocab_size = vocab_size
+
+         # Core vocabulary
+         self.special_tokens = {
+             self.PAD_TOKEN: self.PAD_ID,
+             self.BOS_TOKEN: self.BOS_ID,
+             self.EOS_TOKEN: self.EOS_ID,
+             self.UNK_TOKEN: self.UNK_ID,
+         }
+
+         # Byte vocabulary (256 bytes)
+         self.byte_to_id = {i: i + self.BYTE_OFFSET for i in range(256)}
+         self.id_to_byte = {v: k for k, v in self.byte_to_id.items()}
+
+         # Learned BPE merges: (token1, token2) -> merged_token_id
+         self.merges: Dict[Tuple[int, int], int] = {}
+         self.merge_to_tokens: Dict[int, Tuple[int, int]] = {} # Reverse lookup
+
+         # Token to string (for decoding merged tokens)
+         self.id_to_string: Dict[int, str] = {}
+
+         # Next available ID for new merges
+         self.next_merge_id = self.MERGE_OFFSET
+
+         # Track which data sources have contributed merges
+         # Format: {"source_name": num_merges_contributed}
+         self.sources_contributed: Dict[str, int] = {}
+
+         # Statistics
+         self.total_tokens_processed = 0
+
+         logger.info(f"NeuroTokenizer initialized with vocab_size={vocab_size}")
+
+     @property
+     def pad_token_id(self) -> int:
+         return self.PAD_ID
+
+     @property
+     def bos_token_id(self) -> int:
+         return self.BOS_ID
+
+     @property
+     def eos_token_id(self) -> int:
+         return self.EOS_ID
+
+     @property
+     def unk_token_id(self) -> int:
+         return self.UNK_ID
+
+     @property
+     def current_vocab_size(self) -> int:
+         """
+         The current vocabulary size (valid token IDs: 0 to current_vocab_size-1).
+
+         This grows as the tokenizer learns BPE merges:
+         - Initial: 266 (10 special + 256 bytes)
+         - After learning: 266 + num_merges
+         - Maximum: vocab_size (10M default - effectively unlimited)
+
+         IMPORTANT: During inference, only tokens 0 to current_vocab_size-1 are valid.
+         Tokens beyond this have no learned representation and should not be sampled.
+         """
+         return self.next_merge_id
+
+     def _text_to_bytes(self, text: str) -> List[int]:
+         """Convert text to byte-level token IDs."""
+         return [self.byte_to_id[b] for b in text.encode('utf-8')]
+
+     def _apply_merges(self, token_ids: List[int]) -> List[int]:
+         """
+         Apply learned BPE merges to a sequence of token IDs.
+
+         OPTIMIZED: Uses heap-based approach for O(n log n) instead of O(n²).
+         Merges are applied in priority order (lower merge ID = higher priority).
+         """
+         if not self.merges or len(token_ids) <= 1:
+             return token_ids
+
+         import heapq
+
+         # Convert to list for in-place modification
+         tokens = list(token_ids)
+         n = len(tokens)
+
+         # Track which positions are "deleted" (merged into previous)
+         deleted = [False] * n
+
+         # Build initial heap of mergeable pairs: (merge_id, position)
+         # Lower merge_id = higher priority (learned earlier = more frequent)
+         heap = []
+         for i in range(n - 1):
+             pair = (tokens[i], tokens[i + 1])
+             if pair in self.merges:
+                 heapq.heappush(heap, (self.merges[pair], i))
+
+         while heap:
+             merge_id, pos = heapq.heappop(heap)
+
+             # Skip if position was already processed
+             if pos >= n - 1 or deleted[pos]:
+                 continue
+
+             # Find actual next non-deleted position
+             next_pos = pos + 1
+             while next_pos < n and deleted[next_pos]:
+                 next_pos += 1
+
+             if next_pos >= n:
+                 continue
+
+             # Check if this merge still applies
+             pair = (tokens[pos], tokens[next_pos])
+             if pair not in self.merges or self.merges[pair] != merge_id:
+                 continue
+
+             # Apply merge: replace token at pos, mark next_pos as deleted
+             tokens[pos] = merge_id
+             deleted[next_pos] = True
+
+             # Find previous non-deleted position
+             prev_pos = pos - 1
+             while prev_pos >= 0 and deleted[prev_pos]:
+                 prev_pos -= 1
+
+             # Find next-next non-deleted position
+             next_next_pos = next_pos + 1
+             while next_next_pos < n and deleted[next_next_pos]:
+                 next_next_pos += 1
+
+             # Add new potential merges to heap
+             if prev_pos >= 0:
+                 new_pair = (tokens[prev_pos], tokens[pos])
+                 if new_pair in self.merges:
+                     heapq.heappush(heap, (self.merges[new_pair], prev_pos))
+
+             if next_next_pos < n:
+                 new_pair = (tokens[pos], tokens[next_next_pos])
+                 if new_pair in self.merges:
+                     heapq.heappush(heap, (self.merges[new_pair], pos))
+
+         # Build result excluding deleted positions
+         return [tokens[i] for i in range(n) if not deleted[i]]
+
+     def encode(
+         self,
+         text: str,
+         add_special_tokens: bool = True,
+         max_length: Optional[int] = None,
+         truncation: bool = False,
+         padding: bool = False,
+     ) -> List[int]:
+         """
+         Encode text to token IDs.
+
+         Args:
+             text: Input text
+             add_special_tokens: Add BOS/EOS tokens
+             max_length: Maximum length (truncate if longer)
+             truncation: Whether to truncate
+             padding: Whether to pad to max_length
+
+         Returns:
+             List of token IDs
+         """
+         # Convert to bytes
+         byte_ids = self._text_to_bytes(text)
+
+         # Apply BPE merges
+         token_ids = self._apply_merges(byte_ids)
+
+         # Add special tokens
+         if add_special_tokens:
+             token_ids = [self.BOS_ID] + token_ids + [self.EOS_ID]
+
+         # Truncation
+         if truncation and max_length and len(token_ids) > max_length:
+             token_ids = token_ids[:max_length]
+
+         # Padding
+         if padding and max_length and len(token_ids) < max_length:
+             token_ids = token_ids + [self.PAD_ID] * (max_length - len(token_ids))
+
+         self.total_tokens_processed += len(token_ids)
+         return token_ids
+
+     def _decode_token(self, token_id: int) -> bytes:
+         """Decode a single token ID to bytes."""
+         # Special tokens
+         if token_id in [self.PAD_ID, self.BOS_ID, self.EOS_ID, self.UNK_ID]:
+             return b''
+
+         # Byte token
+         if token_id in self.id_to_byte:
+             return bytes([self.id_to_byte[token_id]])
+
+         # Merged token - recursively decode
+         if token_id in self.merge_to_tokens:
+             t1, t2 = self.merge_to_tokens[token_id]
+             return self._decode_token(t1) + self._decode_token(t2)
+
+         # Unknown token - this should NOT happen in normal operation
+         # If we get here, the model output a token ID beyond current_vocab_size
+         # This is a bug in the generation code (should be masking invalid tokens)
+         logger.warning(f"Unknown token ID {token_id} (vocab_size={self.current_vocab_size}) - using UNK")
+         return b'<unk>'
+
+     def decode(
+         self,
+         token_ids: List[int],
+         skip_special_tokens: bool = True
+     ) -> str:
+         """
+         Decode token IDs to text.
+
+         Args:
+             token_ids: List of token IDs
+             skip_special_tokens: Skip special tokens in output
+
+         Returns:
+             Decoded text
+         """
+         byte_sequence = b''
+
+         for tid in token_ids:
+             if skip_special_tokens and tid in [self.PAD_ID, self.BOS_ID, self.EOS_ID, self.UNK_ID]:
+                 continue
+             byte_sequence += self._decode_token(tid)
+
+         # Decode UTF-8, replacing errors
+         return byte_sequence.decode('utf-8', errors='replace')
+
+     def learn_merges(self, texts: List[str], num_merges: int = 1000, min_frequency: int = 2):
+         """
+         Learn new BPE merges from training data using an optimized algorithm.
+
+         This uses incremental pair counting: after each merge, pair statistics are
+         updated in place rather than recounted from scratch (the naive O(n² × m) approach).
+
+         Args:
+             texts: List of training texts
+             num_merges: Number of new merges to learn
+             min_frequency: Minimum pair frequency to create merge
+         """
+
+         if self.next_merge_id + num_merges > self.vocab_size:
+             num_merges = self.vocab_size - self.next_merge_id
+             if num_merges <= 0:
+                 logger.warning("Vocabulary is full, cannot learn more merges")
+                 return
+
+         logger.info(f"Tokenizing {len(texts)} texts...")
+
+         # Tokenize all texts to current vocabulary
+         # Use a word-based approach: split by whitespace first, then BPE within words
+         # This is much more efficient and produces better tokens
+         word_freq: Counter = Counter()
+         for text in texts:
+             # Split into words (preserve some punctuation patterns)
+             words = re.findall(r'\S+|\s+', text)
+             for word in words:
+                 if word.strip(): # Skip pure whitespace
+                     word_freq[word] += 1
+
+         logger.info(f"Found {len(word_freq)} unique words")
+
+         # Convert words to byte sequences with frequency
+         # Format: {word_tuple: frequency} where word_tuple is tuple of token ids
+         word_tokens: Dict[tuple, int] = {}
+         for word, freq in word_freq.items():
+             byte_ids = tuple(self._text_to_bytes(word))
+             token_ids = tuple(self._apply_merges(list(byte_ids)))
+             if token_ids in word_tokens:
+                 word_tokens[token_ids] += freq
+             else:
+                 word_tokens[token_ids] = freq
+
+         logger.info(f"Converted to {len(word_tokens)} unique token sequences")
+
+         # Build initial pair counts
+         pair_counts: Counter = Counter()
+         # Track which words contain which pairs for efficient updates
+         pair_to_words: Dict[Tuple[int, int], Set[tuple]] = {}
+
+         for word, freq in word_tokens.items():
+             for i in range(len(word) - 1):
+                 pair = (word[i], word[i + 1])
+                 pair_counts[pair] += freq
+                 if pair not in pair_to_words:
+                     pair_to_words[pair] = set()
+                 pair_to_words[pair].add(word)
+
+         logger.info(f"Initial pair count: {len(pair_counts)} unique pairs")
+
+         merges_learned = 0
+         log_interval = max(1, num_merges // 20) # Log ~20 times during learning
+
+         while merges_learned < num_merges:
+             if not pair_counts:
+                 logger.info("No more pairs to merge")
+                 break
+
+             # Find most frequent pair
+             best_pair, count = pair_counts.most_common(1)[0]
+
+             if count < min_frequency:
+                 logger.info(f"Best pair frequency {count} below minimum {min_frequency}")
+                 break
+
+             # Create new merge
+             new_id = self.next_merge_id
+             self.merges[best_pair] = new_id
+             self.merge_to_tokens[new_id] = best_pair
+             self.next_merge_id += 1
+             merges_learned += 1
+
+             if merges_learned % log_interval == 0:
+                 logger.info(f" Learned {merges_learned}/{num_merges} merges, best pair freq={count}")
+
+             # Update word_tokens and pair_counts incrementally
+             words_to_update = pair_to_words.get(best_pair, set()).copy()
+
+             # Remove the merged pair from counts
+             del pair_counts[best_pair]
+             if best_pair in pair_to_words:
+                 del pair_to_words[best_pair]
+
+             for old_word in words_to_update:
+                 if old_word not in word_tokens:
+                     continue
+
+                 freq = word_tokens[old_word]
+
+                 # Remove old pair counts for this word
+                 for i in range(len(old_word) - 1):
+                     pair = (old_word[i], old_word[i + 1])
+                     if pair in pair_counts:
+                         pair_counts[pair] -= freq
+                         if pair_counts[pair] <= 0:
+                             del pair_counts[pair]
+                     if pair in pair_to_words and old_word in pair_to_words[pair]:
+                         pair_to_words[pair].discard(old_word)
+
+                 # Apply merge to create new word
+                 new_word = []
+                 i = 0
+                 while i < len(old_word):
+                     if i < len(old_word) - 1 and (old_word[i], old_word[i + 1]) == best_pair:
+                         new_word.append(new_id)
+                         i += 2
+                     else:
+                         new_word.append(old_word[i])
+                         i += 1
+                 new_word = tuple(new_word)
+
+                 # Update word_tokens
+                 del word_tokens[old_word]
+                 if new_word in word_tokens:
+                     word_tokens[new_word] += freq
+                 else:
+                     word_tokens[new_word] = freq
+
+                 # Add new pair counts for this word
+                 for i in range(len(new_word) - 1):
+                     pair = (new_word[i], new_word[i + 1])
+                     pair_counts[pair] += freq
+                     if pair not in pair_to_words:
+                         pair_to_words[pair] = set()
+                     pair_to_words[pair].add(new_word)
+
+         logger.info(f"Learned {merges_learned} new merges, vocab size now {len(self)}")
+
+     def batch_encode(
+         self,
+         texts: List[str],
+         max_length: Optional[int] = None,
+         padding: bool = True,
+         truncation: bool = True,
+     ) -> Dict[str, List[List[int]]]:
+         """
+         Encode a batch of texts.
+
+         Returns:
+             Dict with 'input_ids' and 'attention_mask'
+         """
+         input_ids = []
+         attention_mask = []
+
+         for text in texts:
+             ids = self.encode(text, max_length=max_length, truncation=truncation, padding=padding)
+             input_ids.append(ids)
+             attention_mask.append([1 if tid != self.PAD_ID else 0 for tid in ids])
+
+         return {"input_ids": input_ids, "attention_mask": attention_mask}
+
+     def save(self, path: str):
+         """
+         Save tokenizer to a JSON file.
+
+         Args:
+             path: Path to save the tokenizer JSON file (must end with .json)
+         """
+         if not path.endswith('.json'):
+             path = path + '.json'
+
+         config = {
+             "vocab_size": self.vocab_size,
+             "next_merge_id": self.next_merge_id,
+             "total_tokens_processed": self.total_tokens_processed,
+             "sources_contributed": self.sources_contributed,
+             # Convert tuple keys to strings for JSON
+             "merges": {f"{k[0]}_{k[1]}": v for k, v in self.merges.items()},
+         }
+
+         # Ensure parent directory exists
+         parent_dir = os.path.dirname(path)
+         if parent_dir:
+             os.makedirs(parent_dir, exist_ok=True)
+
+         with open(path, "w") as f:
+             json.dump(config, f, indent=2)
+
+         logger.info(f"Tokenizer saved to {path} ({self.current_vocab_size} tokens, {len(self.merges)} merges)")
+
+     @classmethod
+     def load(cls, path: str) -> 'NeuroTokenizer':
+         """
+         Load tokenizer from a JSON file.
+
+         Args:
+             path: Path to the tokenizer JSON file
+
+         Returns:
+             Loaded NeuroTokenizer instance
+         """
+         if not path.endswith('.json'):
+             path = path + '.json'
+
+         if not os.path.exists(path):
+             logger.warning(f"No tokenizer found at {path}, creating new one")
+             return cls()
+
+         with open(path) as f:
+             config = json.load(f)
+
+         tokenizer = cls(vocab_size=config.get("vocab_size", 10_000_000)) # Default to unlimited
+         tokenizer.next_merge_id = config.get("next_merge_id", cls.MERGE_OFFSET)
+         tokenizer.total_tokens_processed = config.get("total_tokens_processed", 0)
+         tokenizer.sources_contributed = config.get("sources_contributed", {})
+
+         # Restore merges
+         merges_data = config.get("merges", {})
+         for key_str, merged_id in merges_data.items():
+             t1, t2 = map(int, key_str.split("_"))
+             tokenizer.merges[(t1, t2)] = merged_id
+             tokenizer.merge_to_tokens[merged_id] = (t1, t2)
+
+         logger.info(f"Tokenizer loaded from {path} ({tokenizer.current_vocab_size} tokens, {len(tokenizer.merges)} merges)")
+         return tokenizer
+
+     def __len__(self) -> int:
+         """Return current vocabulary size (valid token count)."""
+         # This should match current_vocab_size for consistency
+         # Base: 266 tokens (IDs 0-265: 10 special/reserved + 256 bytes)
+         # Plus: learned merges
+         return self.current_vocab_size
+
+     def has_source_contributed(self, source_name: str) -> bool:
+         """Check if a data source has already contributed merges."""
+         return source_name in self.sources_contributed
+
+     def record_source_contribution(self, source_name: str, num_merges: int):
+         """Record that a source has contributed merges."""
+         self.sources_contributed[source_name] = num_merges
+         logger.info(f"Recorded contribution: '{source_name}' added {num_merges} merges")
+
+     def get_stats(self) -> Dict:
+         """Get tokenizer statistics."""
+         return {
+             "vocab_size": self.vocab_size,
+             "current_vocab": len(self),
+             "num_merges": len(self.merges),
+             "total_tokens_processed": self.total_tokens_processed,
+             "can_learn_more": self.next_merge_id < self.vocab_size,
+             "sources_contributed": self.sources_contributed,
+         }
+
+
+ # Global tokenizer instance
+ _tokenizer: Optional[NeuroTokenizer] = None
+
+
+ def get_neuro_tokenizer() -> NeuroTokenizer:
+     """Get the global NeuroLLM tokenizer."""
+     global _tokenizer
+     if _tokenizer is None:
+         _tokenizer = NeuroTokenizer()
+     return _tokenizer
+
+
+ def reset_tokenizer():
+     """Reset the global tokenizer (for testing)."""
+     global _tokenizer
+     _tokenizer = None
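
For orientation, here is a minimal usage sketch of the tokenizer file above. It is illustrative only: the corpus, the file path, and the merge count are made up, and it assumes the package is importable as neuroshard.

from neuroshard.core.model.tokenizer import NeuroTokenizer

# A fresh tokenizer has 266 entries: 10 reserved special IDs + 256 byte tokens.
tok = NeuroTokenizer()
assert len(tok) == 266

# Learn a few BPE merges from a tiny, made-up corpus (min_frequency defaults to 2).
tok.learn_merges(["the quick brown fox", "the lazy dog"], num_merges=50)

# Encode/decode round-trip: BOS/EOS are added by default and skipped on decode.
ids = tok.encode("the quick fox")
assert ids[0] == tok.bos_token_id and ids[-1] == tok.eos_token_id
assert tok.decode(ids) == "the quick fox"

# Persist to JSON and reload; merges are restored from their "t1_t2" string keys.
tok.save("/tmp/neuro_tokenizer.json")           # path is arbitrary for this example
restored = NeuroTokenizer.load("/tmp/neuro_tokenizer.json")
assert len(restored) == len(tok)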
@@ -0,0 +1,56 @@
+ # neuroshard/core/network/__init__.py
+ """
+ Network components for NeuroShard.
+
+ - p2p: P2PManager
+ - p2p_data: P2PDataManager
+ - dht: DHT, Node, RoutingTable
+ - dht_protocol: DHTProtocol
+ - dht_service: DHTService, DHTServiceMixin
+ - nat: NATTraverser
+ - nat_traversal: NATTraversalManager, STUNClient
+ - connection_pool: ConnectionPool, get_channel
+ - encrypted_channel: EncryptedPrompt
+ """
+
+ # Lazy imports to avoid circular dependencies
+ __all__ = [
+     'P2PManager',
+     'P2PDataManager',
+     'DHT',
+     'DHTProtocol',
+     'DHTService',
+     'NATTraverser',
+     'NATTraversalManager',
+     'STUNClient',
+     'ConnectionPool',
+     'get_channel',
+ ]
+
+ def __getattr__(name):
+     """Lazy loading of submodules."""
+     if name == 'P2PManager':
+         from neuroshard.core.network.p2p import P2PManager
+         return P2PManager
+     elif name == 'P2PDataManager':
+         from neuroshard.core.network.p2p_data import P2PDataManager
+         return P2PDataManager
+     elif name in ('DHT', 'Node', 'RoutingTable', 'ID_BITS'):
+         from neuroshard.core.network import dht
+         return getattr(dht, name)
+     elif name == 'DHTProtocol':
+         from neuroshard.core.network.dht_protocol import DHTProtocol
+         return DHTProtocol
+     elif name in ('DHTService', 'DHTServiceMixin'):
+         from neuroshard.core.network import dht_service
+         return getattr(dht_service, name)
+     elif name == 'NATTraverser':
+         from neuroshard.core.network.nat import NATTraverser
+         return NATTraverser
+     elif name in ('NATTraversalManager', 'STUNClient', 'NATType', 'PeerConnectivity'):
+         from neuroshard.core.network import nat_traversal
+         return getattr(nat_traversal, name)
+     elif name in ('ConnectionPool', 'get_channel'):
+         from neuroshard.core.network import connection_pool
+         return getattr(connection_pool, name)
+     raise AttributeError(f"module 'neuroshard.core.network' has no attribute '{name}'")
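
A short note on the lazy-import pattern in this __init__.py: module-level __getattr__ (PEP 562) defers importing the heavy submodules until a name is first accessed, which avoids circular imports at package load time. A hypothetical caller might look like this:

# Importing the package does not pull in p2p, dht, or other gRPC-backed modules yet.
import neuroshard.core.network as net

# First attribute access triggers the matching branch in __getattr__.
P2PManager = net.P2PManager          # imports neuroshard.core.network.p2p on demand
ConnectionPool = net.ConnectionPool  # imports neuroshard.core.network.connection_pool

# Names outside the known set raise AttributeError, as the final line guarantees.
try:
    net.NotAThing
except AttributeError as exc:
    print(exc)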
@@ -0,0 +1,72 @@
+ import grpc
+ import time
+ from typing import Dict
+ import threading
+
+ class ConnectionPool:
+     _instance = None
+     _lock = threading.Lock()
+
+     def __new__(cls):
+         if cls._instance is None:
+             with cls._lock:
+                 if cls._instance is None:
+                     cls._instance = super(ConnectionPool, cls).__new__(cls)
+                     cls._instance.channels = {} # url -> channel
+                     cls._instance.last_used = {} # url -> timestamp
+         return cls._instance
+
+     def get_channel(self, address: str):
+         """
+         Get an existing channel or create a new one.
+         address format: "ip:port"
+         """
+         # Normalize address (remove http:// if present)
+         if address.startswith("http://"):
+             address = address.replace("http://", "")
+         if address.startswith("https://"):
+             address = address.replace("https://", "")
+
+         with self._lock:
+             if address in self.channels:
+                 # Check if channel is active (simplified check)
+                 self.last_used[address] = time.time()
+                 return self.channels[address]
+
+             # Create new channel for P2P network
+             # Fast keepalive to detect dead nodes quickly in decentralized network
+             # IMPORTANT: Increase message size for activation tensors in pipeline training!
+             MAX_MESSAGE_SIZE = 64 * 1024 * 1024 # 64MB for large batches/sequences
+             options = [
+                 ('grpc.keepalive_time_ms', 30000), # Ping every 30 seconds
+                 ('grpc.keepalive_timeout_ms', 10000), # 10 second timeout
+                 ('grpc.keepalive_permit_without_calls', True), # Ping even when idle
+                 ('grpc.http2.max_pings_without_data', 0), # Unlimited pings
+                 ('grpc.max_receive_message_length', MAX_MESSAGE_SIZE), # For receiving responses
+                 ('grpc.max_send_message_length', MAX_MESSAGE_SIZE), # For sending activations
+             ]
+             channel = grpc.insecure_channel(address, options=options)
+             self.channels[address] = channel
+             self.last_used[address] = time.time()
+             return channel
+
+     def cleanup(self, max_idle_seconds=300):
+         """Close channels idle for too long"""
+         now = time.time()
+         to_remove = []
+         with self._lock:
+             for addr, last_time in self.last_used.items():
+                 if now - last_time > max_idle_seconds:
+                     to_remove.append(addr)
+
+             for addr in to_remove:
+                 print(f"Closing idle connection to {addr}")
+                 self.channels[addr].close()
+                 del self.channels[addr]
+                 del self.last_used[addr]
+
+ # Global accessor
+ def get_channel(address: str):
+     pool = ConnectionPool()
+     return pool.get_channel(address)
+
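
Finally, a minimal sketch of how the connection pool above might be used by a gRPC client. The address is made up, and the commented-out stub line is hypothetical, since the generated stub classes live in protos/neuroshard_pb2_grpc.py and are not shown here.

from neuroshard.core.network.connection_pool import ConnectionPool, get_channel

# The module-level helper and the singleton pool hand back the same channel object.
ch = get_channel("http://10.0.0.5:50051")        # "http://" is stripped before dialing
same = ConnectionPool().get_channel("10.0.0.5:50051")
assert ch is same

# Channels are created with 64MB send/receive limits and 30s keepalive pings,
# so large activation tensors can be exchanged between pipeline peers.
# stub = neuroshard_pb2_grpc.SomeServiceStub(ch)  # hypothetical; real stub names not shown

# Periodically close channels that have been idle for more than five minutes.
ConnectionPool().cleanup(max_idle_seconds=300)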