joinmultiplayer 0.1.2__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: joinmultiplayer
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: Join joinmultiplayer.ai — the agent-native 'ask the network'. Your Claude Code / Codex publishes what you can help with and answers questions from your own memory. No signup, no account, no credentials — runs locally.
5
5
  Author: Aiconic
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "joinmultiplayer"
7
- version = "0.1.2"
7
+ version = "0.1.4"
8
8
  description = "Join joinmultiplayer.ai — the agent-native 'ask the network'. Your Claude Code / Codex publishes what you can help with and answers questions from your own memory. No signup, no account, no credentials — runs locally."
9
9
  readme = "README.md"
10
10
  license = { text = "MIT" }
@@ -9,4 +9,4 @@ the machine; only the short labels. No signup, no account, no credentials.
9
9
  from .connector import main
10
10
 
11
11
  __all__ = ["main"]
12
- __version__ = "0.1.2"
12
+ __version__ = "0.1.4"
@@ -24,6 +24,23 @@ HISTORY = [Path.home() / ".claude" / "projects", Path.home() / ".codex"]
24
24
  # moves anything they'd rather keep friends-only. The public/friends split is a CHOICE, decided with the agent.
25
25
  _STOP =set("the and for that this with you your из для как что это под про или но не на по от до the a an of "
26
26
  "to in is are how do can what когда где почему мне мой если же бы то так вот они мы вы он она".split())
27
+ # Discourse-glue + tool/transcript scaffolding (RU possessives/modals/imperatives + EN connectives + CC/git tool
28
+ # keys). Stopping these COLLAPSES whole families of noise bigrams ("давай сделаем", "glob grep", "tool uses") before
29
+ # they ever form. Every token was adversarially checked against the real-memory preserve set — only "parameter"
30
+ # (kills PEFT) and "worktree" (legit DevOps topic) collided and were deliberately LEFT OUT. Extend via JM_STOP_EXTRA.
31
+ _STOP |= set((
32
+ "твой твоя твоё твое твоего твоей твоих твою твоим твоими твоём твоем свой своя своё свое своего своему своим "
33
+ "своими своих своей наш наша наше нашего нашему нашем нашей наших нашим нашими тебе тебя тобой меня мне мной вам "
34
+ "вас вами нам нас нами надо нужно нужен нужна нужны нужные можно хочешь хочется хочу должен должна должны давай "
35
+ "давайте сделаем сделай сделать сделаю плиз глянь глянем глянуть кажется можешь можете проверю проверим проверь "
36
+ "проверить посмотрю посмотрим посмотри зайду зайди погоди погнали покажи покажу думаю думаем думаешь делай готов "
37
+ "готова готово готовы короче вообще просто кстати конечно наверное видимо значит типа сейчас прямо потом после "
38
+ "сначала теперь пока уже сразу rather than else each other others most recent anything everything something "
39
+ "nothing someone anyone everyone then now just really actually basically maybe probably literally simply going "
40
+ "want wants lets done made makes gets keeps started toolu output-file local-command command-args command-name "
41
+ "command-message pretooluse posttooluse caveat subagent sub-agent task-notification task-id cwd stdin glob grep "
42
+ "webfetch websearch stdout stderr multiedit notebookedit commit push origin rebase stash checkout workflows "
43
+ + os.environ.get("JM_STOP_EXTRA", "")).split())
27
44
  # A candidate topic LABEL is never appropriate to PROPOSE if it names a credential, a client/company, revenue, or
28
45
  # personal contact — even when it appears in otherwise-public prose. Dropped from every distillation path. Generic
29
46
  # terms (creds/revenue/email) protect everyone; a few owner-specific stems (clients, internal hostnames) are
@@ -34,8 +51,55 @@ _LABEL_DENY = re.compile(
34
51
  r"\brelsy\b|getcourse|" # known clients
35
52
  r"aiconic|georgia|\bdeals?\b|outsource|revenue|\bmrr\b|\barr\b|invoice|оборот|выручк|" # business/private
36
53
  r"gmail|kustyuka|@|" # personal contact
37
- r"miracle|hydra"), # internal host names
54
+ r"miracle|hydra|" # internal host names
55
+ r"[0-9a-f]{8}-[0-9a-f]{4}|\b\d{5,}\b|\b[0-9a-f]{12,}\b|" # UUIDs / long numeric IDs / hex hashes = noise
56
+ # ── owner/teammate IDENTITY + filesystem paths (would PUBLISH a person/path on --register; FP cost ≈ 0) ──
57
+ r"-users-|desktop-llm|\byuka\w*|kust|\bкуст\w*|linkedin yuka|" # owner handle (incl. yuka2/vakust fragments)
58
+ r"\b(igor|vitalik|vitaly|vadim|evgeniy|evgeny|dima)\b|igor-brain|(?<![а-яё])(игор|витал|вадим|евген)[а-яё]*|"
59
+ r"\bдим[аыуой]\b|\bром[аыеу]\b|" # exact declensions (spare роман/видимость)
60
+ # ── infra/host/internal-product scaffolding that reads as a topic but isn't a routable human skill ──
61
+ r"claude-50\d|\bprivate claude\b|\bloopback\b|\blocalhost\b|\bport \d{2,5}\b|\bpinock\b|orange polska|"
62
+ r"joinmultiplayer|\bmultiplayer\w*|"
63
+ r"(?:^|\s)--?[a-z]|^\d{1,4}$"), # CLI flags / path slugs / bare year-or-port
38
64
  re.I)
65
+ # Model/tech stems that LOOK like high-entropy gibberish but are legit (qwen3-14b, rugpt3medium, 5bmodule, fpl8warsaw,
66
+ # gte-qwen2-1) — an allowlist guard so the structural noise predicates below CAN'T eat a real model name.
67
+ _MODEL_STEMS = re.compile(
68
+ r"qwen|llama|chatglm|rugpt|gpt|bge|gte|mistral|falcon|gemma|moe|flux|dora|lora|clip|bm25|fts|sha256|ed25519|"
69
+ r"win95|i18n|ipv4|p2p|era\d|v100|3090|4090|coder|embedding|instruct|turbo|module|warsaw|vibecoder|cosmos|turk|"
70
+ r"deepseek|phi|olmo|smol|kimi|eva|oss|safetensors", re.I)
71
+ _HEX_ALLOW = re.compile(r"ed25519|sha256|sha1\b|sha512|\bmd5\b|blake|crc32|base64|base32", re.I)
72
+
73
+
74
+ def _label_script(tok: str) -> str | None:
75
+ s = re.sub(r"[^a-zа-яё]", "", tok.lower())
76
+ hc = bool(re.search(r"[а-яё]", s)); hl = bool(re.search(r"[a-z]", s))
77
+ if hc and hl: return "mixed"
78
+ if hc: return "cyr"
79
+ if hl: return "lat"
80
+ return None
81
+
82
+
83
+ def _struct_noise(t: str) -> bool:
84
+ """Structural noise that can't be a flat regex alternation (needs allowlist-first guards / script comparison).
85
+ Returns True to DROP. Three rules, all empirically tuned against the real corpus + preserve set:
86
+ 1) high-entropy auth/room-token gibberish (wpzh715ay, yuka2671) — but spare model names via _MODEL_STEMS.
87
+ 2) clause-boundary cross-script bigram (Cyrillic word + Latin word), but NOT hyphen-compounds
88
+ (spares 'control-plane реестр', 'data-plane федеративный').
89
+ 3) short git-SHA / object-id hex run (7-11 chars), but spare crypto terms via _HEX_ALLOW."""
90
+ # 1) high-entropy single token
91
+ if " " not in t and "-" not in t and re.fullmatch(r"[a-z0-9]{7,}", t) \
92
+ and re.search(r"[0-9]", t) and re.search(r"[g-z]", t) and not _MODEL_STEMS.search(t):
93
+ return True
94
+ # 2) cross-script bare bigram
95
+ parts = t.split(" ")
96
+ if len(parts) == 2 and "-" not in parts[0] and "-" not in parts[1] \
97
+ and {_label_script(parts[0]), _label_script(parts[1])} == {"cyr", "lat"}:
98
+ return True
99
+ # 3) short SHA / object-id
100
+ if not _HEX_ALLOW.search(t) and re.search(r"\b[0-9a-f]{7,11}\b", t, re.I):
101
+ return True
102
+ return False
39
103
 
40
104
 
41
105
  def _read_history(max_chars: int = 1_500_000) -> str:
@@ -101,6 +165,9 @@ def _distill(text: str) -> list[str]:
101
165
  """Lexical topic SEED (on-device): frequent meaningful terms + domain bigrams. A crude starting hint only,
102
166
  with NO artificial cap — the AGENT is the real distiller (it reads the whole history and writes the
103
167
  comprehensive set; capturing ALL of what the person knows is the whole point)."""
168
+ text = re.sub(r"\[\[[^\]]*\]\]", " ", text) # [[memory-link]] slugs → not real topics
169
+ text = re.sub(r"\]\([^)]*\)", " ", text) # markdown ](target) link destinations
170
+ text = re.sub(r"\b[\w\-]+\.(?:md|json|jsonl|py|txt|html)\b", " ", text) # filenames (memory-index slugs etc.)
104
171
  words = [w for w in re.split(r"[^a-zа-я0-9\-]+", text.lower()) if len(w) > 3 and w not in _STOP]
105
172
  uni = Counter(words)
106
173
  bi = Counter(f"{a} {b}" for a, b in zip(words, words[1:]) if uni[a] > 5 and uni[b] > 5 and a != b)
@@ -113,8 +180,9 @@ def _distill(text: str) -> list[str]:
113
180
  for w, c in uni.most_common(): # every meaningful frequent unigram
114
181
  if w not in seen and c > 3:
115
182
  topics.append(w); seen.add(w)
116
- # drop sensitive candidate labels (credentials / clients / revenue / personal contact) never propose them
117
- return [t for t in topics if not _LABEL_DENY.search(t)]
183
+ # drop sensitive candidate labels (credentials / clients / revenue / personal contact) + structural noise
184
+ # (auth-token gibberish / cross-script clause fragments / short git-SHAs) — never propose them
185
+ return [t for t in topics if not _LABEL_DENY.search(t) and not _struct_noise(t)]
118
186
 
119
187
 
120
188
  # Narrow business/client/money/personal stems → DEFAULT to friends (so a lazy "go" lands on the SAFER split, not
@@ -128,6 +196,11 @@ _FRIENDS_DEFAULT = re.compile(
128
196
  re.I)
129
197
 
130
198
 
199
+ # the BARE proposal print (no agent in the loop) shows the top-N most-frequent labels, not all ~10k — a reviewable,
200
+ # non-overwhelming, less-noisy set. The --onboard path (where the agent is the real distiller) seeds from more. Env.
201
+ _PROPOSE_CAP = int(os.environ.get("JM_PROPOSE_CAP", "60"))
202
+
203
+
131
204
  def _propose(topics: list[str]) -> dict:
132
205
  """Conservative default split: obvious business/client/money/personal-shaped labels → FRIENDS, generic skills →
133
206
  PUBLIC, so a lazy "go" is SAFE (never max-exposure). The human sees both buckets and moves anything; the agent
@@ -297,6 +370,33 @@ def build_public_view() -> tuple[int, int]:
297
370
  return kept, excl
298
371
 
299
372
 
373
+ def _onboarding_text(max_chars: int = 2_000_000) -> tuple[str, str, int, int]:
374
+ """The text to distill onboarding TOPIC SEEDS from. PREFER curated, private-filtered memory notes (build_public_view
375
+ mirrors **/memory/*.md MINUS private globs): they're dense, deduped expertise AND structurally exclude private
376
+ files — whereas raw .jsonl session transcripts are ~3000× larger, so frequency-ranking over them surfaces CC
377
+ session-mechanics ("tool uses", "agent count", "duration usage") and can even surface sensitive tokens as
378
+ candidate labels. Fall back to raw history ONLY if there are no memory notes at all (so memory-less users still
379
+ work). Strips YAML frontmatter so 'type/metadata/name/description' keys don't become fake topics.
380
+ Returns (text, source, kept, excluded)."""
381
+ try:
382
+ kept, excl = build_public_view()
383
+ except Exception:
384
+ kept = excl = 0
385
+ parts = []
386
+ for p in sorted(PUBLIC_VIEW.glob("*.md")):
387
+ try:
388
+ t = p.read_text("utf-8", errors="ignore")
389
+ except Exception:
390
+ continue
391
+ t = re.sub(r"^\s*---\s*\n.*?\n---\s*\n", " ", t, count=1, flags=re.S) # YAML frontmatter block
392
+ t = re.sub(r"(?im)^\s*(name|description|metadata|type)\s*:.*$", " ", t) # stray frontmatter keys
393
+ parts.append(t)
394
+ text = "\n\n".join(parts)
395
+ if len(text) >= 200:
396
+ return text[:max_chars], "curated-memory", kept, excl
397
+ return _read_history(), "raw-history", kept, excl
398
+
399
+
300
400
  def _gather_public_context(question: str, budget: int = 80_000, per_file: int = 12_000) -> str:
301
401
  """Select the PUBLIC-only notes (already private-filtered into PUBLIC_VIEW) most relevant to the question, in
302
402
  PYTHON, and return them to INLINE into the prompt. The answerer model gets NO tools, so it can only ever see
@@ -740,11 +840,8 @@ def _onboard(a) -> None:
740
840
  if not (a.public or a.friends):
741
841
  # distill from the PRIVATE-FILTERED curated memory, NOT raw transcripts — .jsonl sessions are full of tool/
742
842
  # path noise and can surface sensitive tokens ("brain password", "basic auth") as candidate labels. Fall back
743
- # to raw history only if there are no memory notes at all.
744
- kept, excl = build_public_view()
745
- text = "\n\n".join(p.read_text("utf-8", errors="ignore") for p in sorted(PUBLIC_VIEW.glob("*.md")))
746
- if len(text) < 200:
747
- text = _read_history()
843
+ # to raw history only if there are no memory notes at all. (Shared with the bare path via _onboarding_text.)
844
+ text, _src, kept, excl = _onboarding_text()
748
845
  if len(text) < 200:
749
846
  print(json.dumps({"step": "propose", "topics": {"public": [], "friends": []},
750
847
  "note": "No local AI history found — nothing to distill. You can still --ask."},
@@ -854,6 +951,9 @@ def _onboard(a) -> None:
854
951
 
855
952
 
856
953
  def main() -> None:
954
+ for _s in (sys.stdout, sys.stderr): # Windows cp1252 console crashes on emoji/Cyrillic prints → force UTF-8
955
+ try: _s.reconfigure(encoding="utf-8", errors="replace")
956
+ except Exception: pass
857
957
  ap = argparse.ArgumentParser()
858
958
  ap.add_argument("--propose", action="store_true")
859
959
  ap.add_argument("--register", action="store_true")
@@ -940,7 +1040,7 @@ def main() -> None:
940
1040
  text = _read_chatgpt_export(a.import_chatgpt)
941
1041
  if len(text) < 200:
942
1042
  print(" couldn't read that ChatGPT export — point at conversations.json or the export .zip."); return
943
- split = _propose(_distill(text))
1043
+ split = _propose(_distill(text)[:_PROPOSE_CAP])
944
1044
  print(json.dumps({"proposed": split, "source": "chatgpt-export",
945
1045
  "rule": "≥10% public; nothing uploaded — distilled locally"}, ensure_ascii=False, indent=2))
946
1046
  print("\n register: python3 join.py --register --token <T> --public \"...\" --friends \"...\"")
@@ -1024,11 +1124,15 @@ def main() -> None:
1024
1124
  split = {"public": [x.strip() for x in a.public.split(",") if x.strip()],
1025
1125
  "friends": [x.strip() for x in a.friends.split(",") if x.strip()]}
1026
1126
  else:
1027
- text = _read_history()
1127
+ text, _src, _kept, _excl = _onboarding_text()
1028
1128
  if len(text) < 200:
1029
1129
  print(" no AI history found locally (Claude Code / Codex). Nothing to distill — you can still ASK.")
1030
1130
  return
1031
- split = _propose(_distill(text))
1131
+ all_topics = _distill(text)
1132
+ split = _propose(all_topics[:_PROPOSE_CAP])
1133
+ if len(all_topics) > _PROPOSE_CAP:
1134
+ print(f" (showing the top {_PROPOSE_CAP} of {len(all_topics)} distilled topics — edit freely before "
1135
+ f"--register; the --onboard flow lets your agent curate the full set)")
1032
1136
  print(json.dumps({"proposed": split, "rule": "≥10% public (give-to-get); raw history never leaves device"},
1033
1137
  ensure_ascii=False, indent=2))
1034
1138
  if a.register:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: joinmultiplayer
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: Join joinmultiplayer.ai — the agent-native 'ask the network'. Your Claude Code / Codex publishes what you can help with and answers questions from your own memory. No signup, no account, no credentials — runs locally.
5
5
  Author: Aiconic
6
6
  License: MIT
File without changes