mllang-protocol 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ Metadata-Version: 2.4
2
+ Name: mllang-protocol
3
+ Version: 0.1.2
4
+ Summary: MLLANG v0.1 reference parser — compact text-surface protocol for AI-agent state, lives inside markdown fenced blocks
5
+ Author: Jake Liu
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/jakeliu/mllang
8
+ Project-URL: Repository, https://github.com/jakeliu/mllang
9
+ Project-URL: Documentation, https://mllang.com
10
+ Project-URL: Issues, https://github.com/jakeliu/mllang/issues
11
+ Keywords: mllang,ai,agents,multi-agent,llm,protocol
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Software Development :: Libraries
21
+ Requires-Python: >=3.9
@@ -0,0 +1,40 @@
1
+ """MLLANG v0.1 reference parser — pure stdlib.
2
+
3
+ Usage:
4
+ from mllang import Packet, parse, compose, extract_from_markdown
5
+
6
+ p = parse("V:0.1.r1; I:demo; G:{task=test}; S:{x=1}; N:@K -> classify; H:<=>; P:0.85;")
7
+ print(p.next_agent) # @K
8
+ print(p.halt) # <=>
9
+ print(p.confidence) # 0.85
10
+
11
+ text = compose(p)
12
+ packets = extract_from_markdown(open("task.md").read())
13
+ """
14
+
15
+ from .packet import Packet, parse, compose, extract_from_markdown, validate
16
+ from .slots import SLOT_ORDER, REQUIRED_SLOTS
17
+ from .halt import HALT_ENUM
18
+ from .operators import OPERATORS
19
+ from .sanitize import sanitize, sanitize_to_json, VALID_LEVELS as TELEMETRY_LEVELS
20
+ from .embed import embed_in_markdown, extract_summary_and_packet
21
+
22
+ __version__ = "0.1.2"
23
+
24
+ __all__ = [
25
+ "Packet",
26
+ "parse",
27
+ "compose",
28
+ "extract_from_markdown",
29
+ "validate",
30
+ "sanitize",
31
+ "sanitize_to_json",
32
+ "TELEMETRY_LEVELS",
33
+ "embed_in_markdown",
34
+ "extract_summary_and_packet",
35
+ "SLOT_ORDER",
36
+ "REQUIRED_SLOTS",
37
+ "HALT_ENUM",
38
+ "OPERATORS",
39
+ "__version__",
40
+ ]
@@ -0,0 +1,91 @@
1
+ """Markdown-embed helpers for MLLANG packets.
2
+
3
+ Default pattern (summary mode):
4
+ short workflow summary + fenced ```mllang block.
5
+
6
+ Rationale: when the consumer is an AI, the packet already carries the
7
+ state, and the `EN:` shadow line is the human-skim summary. Long prose
8
+ duplicates the packet content for a human reader who is rarely going to
9
+ read it. Keep the surrounding text tight; let `EN:` do the dual-channel.
10
+
11
+ Opt-in `mode="verbose"` keeps a long-prose channel above the packet for
12
+ human-authored docs (PRs, issues, design notes) where readers do want
13
+ the narrative.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from typing import Optional, Union
19
+
20
+ from .packet import Packet, compose, parse
21
+
22
+ VALID_MODES = {"summary", "verbose", "packet_only"}
23
+ DEFAULT_FENCE = "mllang"
24
+
25
+
26
def embed_in_markdown(
    packet: Union[Packet, str],
    summary: str = "",
    prose: Optional[str] = None,
    mode: str = "summary",
    title: Optional[str] = None,
    fence: str = DEFAULT_FENCE,
) -> str:
    """Render a packet as markdown: optional heading and lead-in, then a fenced block.

    Args:
        packet: a Packet (serialized with compose()) or MLLANG text, which is
            whitespace-stripped and embedded as given.
        summary: lead-in text, emitted only when mode == "summary".
        prose: lead-in text, emitted only when mode == "verbose".
        mode: one of VALID_MODES; "packet_only" emits no lead-in text.
        title: when non-empty, emitted first as a "# <title>" heading.
        fence: language tag written after the opening backticks.

    Returns:
        The sections joined by blank lines, terminated by one newline.

    Raises:
        ValueError: if mode is not in VALID_MODES.
    """
    if mode not in VALID_MODES:
        raise ValueError(f"mode must be one of {sorted(VALID_MODES)}; got {mode!r}")

    body = compose(packet) if isinstance(packet, Packet) else packet.strip()

    # Only the text belonging to the selected mode is used; the other is ignored.
    lead_in = {"summary": summary, "verbose": prose}.get(mode)

    sections: list[str] = [f"# {title}"] if title else []
    if lead_in:
        sections.append(lead_in.strip())
    sections.append(f"```{fence}\n{body}\n```")
    return "\n\n".join(sections) + "\n"
64
+
65
+
66
def extract_summary_and_packet(md_text: str, fence: str = DEFAULT_FENCE) -> tuple[str, Optional[Packet]]:
    """Split markdown into the text before the first fenced block and its packet.

    Args:
        md_text: markdown that may contain a fenced block tagged with `fence`.
        fence: language tag of the fenced block to look for.

    Returns:
        (summary, packet). Without a matching block, the whole stripped text
        and None. Otherwise the stripped text preceding the block and the
        parsed packet, or None in its place when parse() raises ValueError.
        If the preceding text starts with "#", the leading run of heading
        and blank lines is removed from the summary.
    """
    import re

    found = re.search(rf"```{re.escape(fence)}\s*\n(.*?)\n```", md_text, re.DOTALL)
    if found is None:
        return md_text.strip(), None

    lead = md_text[: found.start()].strip()
    if lead.startswith("#"):
        rows = lead.splitlines()
        # Index of the first row that is neither a heading nor blank.
        first_body_row = next(
            (idx for idx, row in enumerate(rows) if row.strip() and not row.startswith("#")),
            len(rows),
        )
        lead = "\n".join(rows[first_body_row:]).strip()

    try:
        return lead, parse(found.group(1))
    except ValueError:
        return lead, None
@@ -0,0 +1,39 @@
1
+ """MLLANG halt enum (9-way)."""
2
+
3
+ import re
4
+
5
# Literal halt values. The round-limit halt is matched by the pattern below
# instead of being listed here.
HALT_ENUM = {
    "accept",
    "repair",
    "regen",
    "escalate@H",
    "test=pass",
    "test=fail",
    "risk!high",
    "<=>",
}

# Accepts "after-<digits>-rounds" as well as the literal placeholder
# "after-N-rounds".
_AFTER_N_ROUNDS_RE = re.compile(r"^after-\d+-rounds$|^after-N-rounds$")


def is_valid_halt(halt_str: str) -> bool:
    """Tell whether every "|"-separated piece of `halt_str` is a known halt.

    Each piece is whitespace-stripped and must either be a member of
    HALT_ENUM or match the after-N-rounds pattern. An empty string is
    invalid, and so is any string with an empty piece (e.g. "accept|").
    """
    if not halt_str:
        return False
    tokens = (piece.strip() for piece in halt_str.split("|"))
    return all(
        token in HALT_ENUM or _AFTER_N_ROUNDS_RE.match(token) is not None
        for token in tokens
    )
33
+
34
+
35
def halt_categories(halt_str: str) -> list:
    """Split a halt string on "|" and return the whitespace-stripped pieces.

    Returns an empty list for an empty (falsy) input. Pieces are not
    validated; use is_valid_halt() for that.
    """
    return [piece.strip() for piece in halt_str.split("|")] if halt_str else []
@@ -0,0 +1,29 @@
1
+ """MLLANG operator definitions."""
2
+
3
+ # Operator table from v0.1 spec section 3 (21 entries, counting the `;` and `,` separators)
4
+ OPERATORS = {
5
+ "=": "is / equals",
6
+ ":=": "assign",
7
+ "==": "confirmed equal (verified)",
8
+ "?": "unknown / open",
9
+ "!": "assertion / must",
10
+ "*": "important / pinned",
11
+ "~": "approximate / loose",
12
+ "^": "parent / prior round (e.g. ^r1)",
13
+ "->": "leads to / next step",
14
+ "=>": "implies / therefore",
15
+ "<=>": "agreed by all parties (halt value)",
16
+ "&": "and",
17
+ "|": "or",
18
+ "^!": "reserved compound (never appears bare)",
19
+ "#": "tag / topic",
20
+ "$": "tool-invocation shorthand (inside Y: slot only)",
21
+ "[]": "list / set",
22
+ "{}": "map / struct",
23
+ "()": "group",
24
+ ";": "slot separator",
25
+ ",": "item separator",
26
+ }
27
+
28
+ # Agent codes
29
+ AGENT_CODES = {"@C", "@X", "@K", "@G", "@M", "@H", "@?"}
@@ -0,0 +1,361 @@
1
+ """MLLANG Packet parser/composer/validator.
2
+
3
+ Reference implementation, pure stdlib, ~200 LOC core.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import re
9
+ from dataclasses import dataclass, field
10
+ from typing import Optional, Union, List, Dict
11
+
12
+ from .slots import SLOT_ORDER, REQUIRED_SLOTS, MAP_SLOTS, LIST_SLOTS
13
+ from .halt import is_valid_halt
14
+ from .operators import AGENT_CODES
15
+
16
+
17
+ # ── Regex helpers ───────────────────────────────────────────────────────
18
+
19
+
20
+ # Match a fenced ```mllang block in markdown
21
+ _MARKDOWN_FENCE_RE = re.compile(r"```mllang\s*\n(.*?)\n```", re.DOTALL)
22
+
23
+ # Match a slot: KEY:value;
24
+ # Slot value continues until next ; that's not inside brackets/braces/quotes
25
+ _SLOT_KEY_RE = re.compile(r"(?:^|;)\s*([A-Z]):\s*", re.MULTILINE)
26
+
27
+ # Match version: V:MAJOR.MINOR.rROUND
28
+ _VERSION_RE = re.compile(r"^\s*(\d+)\.(\d+)\.r(\d+)\s*$")
29
+
30
+ # Match next agent: @C -> verb
31
+ _NEXT_AGENT_RE = re.compile(r"@([CXKGMH?])\s*->\s*(\S+)")
32
+
33
+
34
+ # ── Packet dataclass ────────────────────────────────────────────────────
35
+
36
+
37
@dataclass
class Packet:
    """In-memory form of one MLLANG v0.1 packet.

    Every field has an empty default, so `Packet()` is a blank packet.
    Map slots are dicts of str to str, list slots are lists of str, and
    the remaining slots are plain strings (confidence is a float).
    """

    # --- required slots ---------------------------------------------------
    version: str = ""  # V slot, e.g. "0.1.r1"
    goal: Dict[str, str] = field(default_factory=dict)  # G slot
    state: Dict[str, str] = field(default_factory=dict)  # S slot
    next_agent: str = ""  # N slot, kept whole, e.g. "@K -> verb"
    halt: str = ""  # H slot, e.g. "<=>" or "test=pass | after-3-rounds"

    # --- optional slots ---------------------------------------------------
    thread_id: str = ""  # I slot
    decisions: List[str] = field(default_factory=list)  # D slot
    evidence: List[str] = field(default_factory=list)  # E slot
    unknowns: List[str] = field(default_factory=list)  # U slot
    risks: List[str] = field(default_factory=list)  # R slot
    test: Dict[str, str] = field(default_factory=dict)  # T slot
    files: List[str] = field(default_factory=list)  # F slot
    tool_calls: List[str] = field(default_factory=list)  # Y slot
    budget: Dict[str, str] = field(default_factory=dict)  # B slot
    confidence: float = 0.0  # P slot
    assumptions: List[str] = field(default_factory=list)  # A slot

    # Text of the "EN:" shadow line; carried next to the slots, not one of them.
    en_shadow: str = ""

    @classmethod
    def parse(cls, text: str) -> "Packet":
        """Build a packet from MLLANG text via the module-level parse()."""
        return parse(text)

    def compose(self) -> str:
        """Serialize this packet via the module-level compose()."""
        return compose(self)

    def validate(self) -> List[str]:
        """Return the validation errors from the module-level validate(); empty means valid."""
        return validate(self)

    @property
    def next_agent_code(self) -> str:
        """The "@X" agent code found in the N slot, or "" when none is found."""
        found = _NEXT_AGENT_RE.search(self.next_agent)
        if found is None:
            return ""
        return f"@{found.group(1)}"

    @property
    def next_agent_verb(self) -> str:
        """The verb following "->" in the N slot, or "" when none is found."""
        found = _NEXT_AGENT_RE.search(self.next_agent)
        if found is None:
            return ""
        return found.group(2)

    def __str__(self) -> str:
        return self.compose()
91
+
92
+
93
+ # ── Parser ──────────────────────────────────────────────────────────────
94
+
95
+
96
def _strip_en_shadow(text: str) -> tuple[str, str]:
    """Separate the `EN:` shadow line(s) from the packet body.

    Every line whose first non-blank characters are `EN:` is removed,
    wherever it appears; when there are several, the text of the last one
    wins. The remaining lines are joined with single spaces.

    Indentation before `EN:` is tolerated. Without that, an indented
    shadow line (e.g. a packet nested in a markdown list) would stay in
    the packet body, where the slot splitter can mistake its `N:` for a
    slot key.

    Returns:
        (packet_text, en_text); en_text is "" when no shadow line exists.
    """
    shadow = ""
    body_lines = []
    for line in text.strip().splitlines():
        candidate = line.lstrip()
        if candidate.startswith("EN:"):
            shadow = candidate[len("EN:"):].strip()
        else:
            body_lines.append(line)
    return " ".join(body_lines).strip(), shadow
107
+
108
+
109
def _split_slots(packet_text: str) -> List[tuple[str, str]]:
    """Split packet text into `[(slot_key, value), ...]` in source order.

    A slot starts with a single uppercase letter followed by ":" and runs
    to the next ";" that is outside double quotes and outside any
    bracket/brace/paren nesting. Values are returned whitespace-stripped.
    Text that does not start a slot is skipped one character at a time.

    Two safeguards keep malformed input from corrupting later slots:

    * A slot key must not directly follow a letter or digit, so the "N:"
      inside a longer token such as "EN:" is not read as the N slot.
    * Nesting depth never drops below zero, so one stray closing bracket
      cannot hide every following ";" and swallow the rest of the packet.
    """
    text = packet_text.strip()
    end = len(text)
    slots: List[tuple[str, str]] = []
    i = 0
    while i < end:
        # Skip whitespace and separators between slots.
        while i < end and text[i] in " \t;\n":
            i += 1
        if i >= end:
            break

        starts_slot = (
            i + 1 < end
            and text[i].isupper()
            and text[i + 1] == ":"
            and (i == 0 or not text[i - 1].isalnum())
        )
        if not starts_slot:
            # Not a slot key here: skip this character and keep scanning.
            i += 1
            continue

        key = text[i]
        i += 2  # step over "K:"
        start = i
        depth = 0
        in_quote = False
        while i < end:
            ch = text[i]
            if ch == '"' and text[i - 1] != "\\":
                in_quote = not in_quote
            elif not in_quote:
                if ch in "[{(":
                    depth += 1
                elif ch in "]})":
                    depth = max(depth - 1, 0)
                elif ch == ";" and depth == 0:
                    break
            i += 1
        slots.append((key, text[start:i].strip()))
    return slots
149
+
150
+
151
def _parse_map(value: str) -> Dict[str, str]:
    """Turn `{k=v, k=v}` text into a dict of stripped keys and values.

    One pair of enclosing braces is removed when present. Entries are
    separated by top-level commas and split at their first "=". An entry
    without "=" becomes a key mapped to "". Blank entries are dropped.
    """
    body = value.strip()
    if body[:1] == "{" and body[-1:] == "}":
        body = body[1:-1].strip()

    result: Dict[str, str] = {}
    if not body:
        return result

    for chunk in _split_top_level(body, ","):
        name, equals, rest = chunk.strip().partition("=")
        if equals:
            result[name.strip()] = rest.strip()
        elif name:
            result[name] = ""
    return result
168
+
169
+
170
def _parse_list(value: str) -> List[str]:
    """Turn `[a, b, c]` text into a list of whitespace-stripped items.

    One pair of enclosing square brackets is removed when present; items
    are separated by top-level commas. Empty content gives an empty list.
    """
    body = value.strip()
    if body[:1] == "[" and body[-1:] == "]":
        body = body[1:-1].strip()
    if not body:
        return []
    return [item.strip() for item in _split_top_level(body, ",")]
178
+
179
+
180
def _split_top_level(text: str, sep: str) -> List[str]:
    """Split `text` on the single character `sep`, ignoring nested occurrences.

    A separator counts only when it is outside double quotes and outside
    any bracket/brace/paren nesting. A double quote preceded by a
    backslash does not toggle quoting. Pieces are returned unstripped. A
    trailing empty piece (text ending in `sep`) is dropped, while empty
    pieces elsewhere are kept.

    Nesting depth never drops below zero, so a stray closing bracket does
    not stop every later separator from being recognized.
    """
    pieces: List[str] = []
    current: List[str] = []
    depth = 0
    in_quote = False
    previous = ""
    for ch in text:
        if ch == '"' and previous != "\\":
            in_quote = not in_quote
        elif not in_quote:
            if ch in "[{(":
                depth += 1
            elif ch in "]})":
                depth = max(depth - 1, 0)
            elif ch == sep and depth == 0:
                # Top-level separator: close the current piece and drop the separator.
                pieces.append("".join(current))
                current = []
                previous = ch
                continue
        current.append(ch)
        previous = ch
    if current:
        pieces.append("".join(current))
    return pieces
206
+
207
+
208
def parse(text: str) -> Packet:
    """Build a Packet from MLLANG text.

    `EN:` shadow lines are separated out first and stored in `en_shadow`.
    Each recognized slot then sets its field; when a slot repeats, the
    last occurrence wins. Unrecognized slot letters are ignored, and a P
    value that is not a float leaves `confidence` unchanged.

    Raises:
        ValueError: if `text` is empty or whitespace only.
    """
    if not text or not text.strip():
        raise ValueError("Empty packet")

    body, shadow = _strip_en_shadow(text)

    # Slot letter -> Packet attribute, grouped by how the value is decoded.
    scalar_attrs = {"V": "version", "I": "thread_id", "N": "next_agent", "H": "halt"}
    map_attrs = {"G": "goal", "S": "state", "T": "test", "B": "budget"}
    list_attrs = {
        "D": "decisions",
        "E": "evidence",
        "U": "unknowns",
        "R": "risks",
        "F": "files",
        "Y": "tool_calls",
        "A": "assumptions",
    }

    packet = Packet()
    packet.en_shadow = shadow
    for key, raw in _split_slots(body):
        if key in scalar_attrs:
            setattr(packet, scalar_attrs[key], raw.strip())
        elif key in map_attrs:
            setattr(packet, map_attrs[key], _parse_map(raw))
        elif key in list_attrs:
            setattr(packet, list_attrs[key], _parse_list(raw))
        elif key == "P":
            try:
                packet.confidence = float(raw.strip())
            except ValueError:
                # A malformed confidence is tolerated; the field keeps its value.
                pass
    return packet
260
+
261
+
262
def compose(p: Packet) -> str:
    """Serialize a packet to MLLANG text.

    Non-empty slots are written in the fixed order V I G S D E U R T F Y
    B N H P A as "K:value", joined by "; " and closed with ";". Maps are
    written as "{k=v, ...}" and lists as "[a, b, ...]". P is written only
    when confidence is greater than zero, with two decimals. A non-empty
    `en_shadow` is appended on its own "EN: " line.
    """
    chunks = []

    def as_map(mapping):
        return "{" + ", ".join(f"{k}={v}" for k, v in mapping.items()) + "}"

    def as_list(items):
        return "[" + ", ".join(items) + "]"

    def emit(key, value, render=None):
        # Empty slots are left out of the output entirely.
        if value:
            chunks.append(f"{key}:{render(value) if render else value}")

    emit("V", p.version)
    emit("I", p.thread_id)
    emit("G", p.goal, as_map)
    emit("S", p.state, as_map)
    emit("D", p.decisions, as_list)
    emit("E", p.evidence, as_list)
    emit("U", p.unknowns, as_list)
    emit("R", p.risks, as_list)
    emit("T", p.test, as_map)
    emit("F", p.files, as_list)
    emit("Y", p.tool_calls, as_list)
    emit("B", p.budget, as_map)
    emit("N", p.next_agent)
    emit("H", p.halt)
    if p.confidence > 0:
        chunks.append(f"P:{p.confidence:.2f}")
    emit("A", p.assumptions, as_list)

    text = "; ".join(chunks) + ";"
    if p.en_shadow:
        text += f"\nEN: {p.en_shadow}"
    return text
309
+
310
+
311
def validate(p: Packet) -> List[str]:
    """Check a packet and return human-readable error strings.

    Checks, in reporting order: V present and in MAJOR.MINOR.rROUND form;
    G present; S present; N present and containing "@<agent> -> <verb>";
    H present and accepted by is_valid_halt(); a non-zero confidence
    within 0.0-1.0; the N agent code listed in AGENT_CODES.

    Returns:
        The list of errors; an empty list means the packet is valid.
    """
    problems: List[str] = []
    report = problems.append

    if not p.version:
        report("missing required slot V (version)")
    elif _VERSION_RE.match(p.version) is None:
        report(f"V slot must be MAJOR.MINOR.rROUND format, got: {p.version!r}")

    for slot_value, message in (
        (p.goal, "missing required slot G (goal)"),
        (p.state, "missing required slot S (state)"),
    ):
        if not slot_value:
            report(message)

    # The N match is kept for the agent-code check at the end.
    agent_match = None
    if not p.next_agent:
        report("missing required slot N (next agent)")
    else:
        agent_match = _NEXT_AGENT_RE.search(p.next_agent)
        if agent_match is None:
            report(f"N slot must be '@<agent> -> <verb>' format, got: {p.next_agent!r}")

    if not p.halt:
        report("missing required slot H (halt)")
    elif not is_valid_halt(p.halt):
        report(f"H slot has invalid value(s): {p.halt!r}")

    # A confidence of zero is not range-checked.
    if p.confidence and not (0.0 <= p.confidence <= 1.0):
        report(f"P slot must be 0.00-1.00, got: {p.confidence}")

    if agent_match is not None:
        agent = f"@{agent_match.group(1)}"
        if agent not in AGENT_CODES:
            report(f"unknown agent code in N: {agent}")

    return problems
350
+
351
+
352
def extract_from_markdown(md_text: str) -> List[Packet]:
    """Parse every fenced ```mllang block in the markdown, in document order.

    Blocks for which parse() raises ValueError are skipped.
    """
    packets: List[Packet] = []
    for found in _MARKDOWN_FENCE_RE.finditer(md_text):
        try:
            packet = parse(found.group(1))
        except ValueError:
            continue
        packets.append(packet)
    return packets
@@ -0,0 +1,270 @@
1
+ """MLLANG telemetry sanitization.
2
+
3
+ Strips IP from packets BEFORE telemetry leaves user's machine. Slot
4
+ SHAPES are public; slot VALUES stay private.
5
+
6
+ Public API:
7
+ sanitize(packet, level=None, reject_leaks=True) -> dict | None
8
+
9
+ Levels:
10
+ off — return None (nothing sent). Default.
11
+ shape — slot presence, halt, confidence, next-agent code, operators.
12
+ structured — shape + map keys (no values) + verb names + counts.
13
+ full — structured + redacted map values + assumption prefixes.
14
+
15
+ Set via MLLANG_TELEMETRY env var or explicit level=... kwarg.
16
+ Per-packet override always wins over env var.
17
+
18
+ Defense in depth: by default, sanitize() refuses to ship a payload that
19
+ still matches any leak detector (email/path/api-key/long-quote) and
20
+ returns None instead.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import hashlib
26
+ import json
27
+ import os
28
+ import re
29
+ from typing import Any, Dict, List, Optional, Union
30
+
31
+ from .halt import halt_categories
32
+ from .operators import OPERATORS
33
+ from .packet import Packet, compose, parse
34
+
35
+ VALID_LEVELS = {"off", "shape", "structured", "full"}
36
+ DEFAULT_LEVEL = "off"
37
+
38
+ # Operators excluded from telemetry: structural / too common to be a signal.
39
+ _TRIVIAL_OPS = {";", ",", "=", "[]", "{}", "()"}
40
+
41
+ # Leak detectors — last-line defense before payload ships.
42
+ _EMAIL_RE = re.compile(r"\b[\w.+-]+@[\w-]+\.[\w.-]+\b")
43
+ _PATH_RE = re.compile(r"(?:/[A-Za-z0-9._-]+){2,}")
44
+ _API_KEY_RE = re.compile(
45
+ r"\b(?:sk|pk|api|key|token|secret|bearer)[-_=][A-Za-z0-9_-]{16,}",
46
+ re.IGNORECASE,
47
+ )
48
+ _LONG_QUOTE_RE = re.compile(r'"[^"]{60,}"')
49
+
50
+ _LEAK_DETECTORS = [
51
+ ("email", _EMAIL_RE),
52
+ ("path", _PATH_RE),
53
+ ("api_key", _API_KEY_RE),
54
+ ("long_quote", _LONG_QUOTE_RE),
55
+ ]
56
+
57
+ _TOOL_VERB_RE = re.compile(r"\$?([A-Za-z_][A-Za-z0-9_]*)")
58
+ _AGENT_CODE_RE = re.compile(r"@[CXKGMH?]")
59
+
60
+
61
def _level_from_env(explicit: Optional[str]) -> str:
    """Resolve the telemetry level to use.

    An explicit value (anything other than None) is taken as given;
    otherwise the MLLANG_TELEMETRY environment variable is read and
    lower-cased, with DEFAULT_LEVEL used when it is unset. Any result
    that is not in VALID_LEVELS falls back to DEFAULT_LEVEL.
    """
    if explicit is not None:
        chosen = explicit
    else:
        chosen = os.environ.get("MLLANG_TELEMETRY", DEFAULT_LEVEL).lower()
    return chosen if chosen in VALID_LEVELS else DEFAULT_LEVEL
69
+
70
+
71
def _hash_thread_id(thread_id: str) -> str:
    """Return "<I:hash:XXXXXXXXXXXX>" for a thread id, or "" when it is empty.

    The 12 characters are the start of the hex SHA-256 digest of the
    UTF-8 encoded id.
    """
    if thread_id:
        short = hashlib.sha256(thread_id.encode("utf-8")).hexdigest()[:12]
        return f"<I:hash:{short}>"
    return ""
76
+
77
+
78
def _detect_operators(packet_text: str) -> List[str]:
    """List the non-trivial operators that occur in the raw packet text.

    Operators are tried longest first (ties keep OPERATORS order), and
    those in _TRIVIAL_OPS are skipped. Once an operator is found, all of
    its occurrences are blanked out before shorter ones are tried. That
    stops a multi-character operator from also being reported as the
    shorter operators it contains, e.g. "<=>" as "=>", or "^!" as "^"
    and "!".

    Returns:
        The detected operators, longest first.
    """
    found: List[str] = []
    remaining = packet_text
    for op in sorted(OPERATORS.keys(), key=lambda o: -len(o)):
        if op in _TRIVIAL_OPS:
            continue
        if op in remaining:
            found.append(op)
            # Replace with a space so the neighbours cannot fuse into a
            # new, spurious operator.
            remaining = remaining.replace(op, " ")
    return found
90
+
91
+
92
def _redact_map(m: Dict[str, str]) -> Dict[str, str]:
    """Keep the keys of `m` but replace each value with a length-only placeholder."""
    redacted: Dict[str, str] = {}
    for key, value in m.items():
        redacted[key] = f"<REDACTED:{len(value)}-chars>"
    return redacted
94
+
95
+
96
def _verb_names(tool_calls: List[str]) -> List[str]:
    """Return the leading verb name of each `$verb(args)` tool call.

    Only an identifier at the very start of the call (after optional
    whitespace and an optional "$") is accepted. The anchoring matters:
    an unanchored search would fall through to the first identifier
    anywhere in the string, which for input such as "$(name)" is an
    argument rather than a verb. Calls without a leading verb are
    skipped, so arguments are never returned.
    """
    verbs: List[str] = []
    for call in tool_calls:
        # match(), not search(): never look past the start of the call.
        hit = _TOOL_VERB_RE.match(call.lstrip())
        if hit:
            verbs.append(hit.group(1))
    return verbs
104
+
105
+
106
def _agent_code(next_agent: str) -> str:
    """Return the first "@X" agent code in the N slot text, or "" if there is none.

    A falsy `next_agent` (empty string or None) is treated as empty text.
    """
    codes = _AGENT_CODE_RE.findall(next_agent or "")
    return codes[0] if codes else ""
109
+
110
+
111
def _slots_present(p: Packet) -> List[str]:
    """Return the letters of the slots whose field is truthy, in canonical order."""
    candidates = (
        ("V", p.version),
        ("I", p.thread_id),
        ("G", p.goal),
        ("S", p.state),
        ("D", p.decisions),
        ("E", p.evidence),
        ("U", p.unknowns),
        ("R", p.risks),
        ("T", p.test),
        ("F", p.files),
        ("Y", p.tool_calls),
        ("B", p.budget),
        ("N", p.next_agent),
        ("H", p.halt),
        ("P", p.confidence),
        ("A", p.assumptions),
    )
    return [letter for letter, value in candidates if value]
146
+
147
+
148
def _walk_strings(value: Any):
    """Yield the strings found in `value`, depth first.

    A string yields itself. A dict yields each key followed by whatever
    its value yields. A list yields from each item. Any other type
    yields nothing.
    """
    if isinstance(value, str):
        yield value
        return
    if isinstance(value, dict):
        for key, nested in value.items():
            yield key
            yield from _walk_strings(nested)
        return
    if isinstance(value, list):
        for element in value:
            yield from _walk_strings(element)
158
+
159
+
160
def _detect_leaks(payload: Dict[str, Any]) -> List[str]:
    """Scan a payload's strings with the leak detectors.

    The "thread_hash" entry is not scanned. For every other string under
    a payload key, the first detector in _LEAK_DETECTORS that matches
    adds one "<key>:<detector-name>" entry.

    Returns:
        The hit list; empty when nothing matched.
    """
    hits: List[str] = []
    for field_name, field_value in payload.items():
        if field_name == "thread_hash":
            continue
        for text in _walk_strings(field_value):
            # At most one hit is recorded per string.
            first = next(
                (label for label, pattern in _LEAK_DETECTORS if pattern.search(text)),
                None,
            )
            if first is not None:
                hits.append(f"{field_name}:{first}")
    return hits
172
+
173
+
174
def sanitize(
    packet: Union[Packet, str],
    level: Optional[str] = None,
    reject_leaks: bool = True,
) -> Optional[Dict[str, Any]]:
    """Build the telemetry payload for a packet, or None when nothing may be sent.

    Args:
        packet: a Packet, or raw MLLANG text to be parsed.
        level: "off", "shape", "structured" or "full"; None means resolve
            it through _level_from_env().
        reject_leaks: when True, give up (return None) if a leak detector
            matches the raw packet text or any string in the payload.

    Returns:
        The payload dict, or None when the level resolves to "off", the
        text cannot be parsed, the packet has no version, or a leak was
        detected with reject_leaks enabled.
    """
    tier = _level_from_env(level)
    if tier == "off":
        return None

    if isinstance(packet, Packet):
        parsed = packet
        source_text = compose(parsed)
    else:
        try:
            parsed = parse(packet)
        except ValueError:
            return None
        source_text = packet

    # A packet without a version is treated as unusable.
    if not parsed.version:
        return None

    # First gate: scan the raw text before anything is derived from it.
    if reject_leaks and any(rx.search(source_text) for _name, rx in _LEAK_DETECTORS):
        return None

    payload: Dict[str, Any] = {
        "v": parsed.version,
        "slots_present": _slots_present(parsed),
        "operators_used": _detect_operators(source_text),
        "halt": parsed.halt,
        "halt_categories": halt_categories(parsed.halt),
        "confidence": parsed.confidence,
        "next_agent": _agent_code(parsed.next_agent),
        "thread_hash": _hash_thread_id(parsed.thread_id),
        "level": tier,
    }

    if tier in ("structured", "full"):
        payload["goal_keys"] = list(parsed.goal.keys())
        payload["state_keys"] = list(parsed.state.keys())
        # Only pass/fail outcomes are reported; other test values are dropped.
        payload["test_results"] = {
            name: outcome
            for name, outcome in parsed.test.items()
            if outcome in ("pass", "fail")
        }
        payload["tool_verbs"] = _verb_names(parsed.tool_calls)
        payload["decisions_count"] = len(parsed.decisions)
        payload["evidence_count"] = len(parsed.evidence)
        payload["unknowns_count"] = len(parsed.unknowns)
        payload["risks_count"] = len(parsed.risks)
        payload["files_count"] = len(parsed.files)
        payload["tool_calls_count"] = len(parsed.tool_calls)
        payload["assumptions_count"] = len(parsed.assumptions)
        payload["en_shadow_length"] = len(parsed.en_shadow)

    if tier == "full":
        payload["goal_values_redacted"] = _redact_map(parsed.goal)
        payload["state_values_redacted"] = _redact_map(parsed.state)
        payload["assumptions_prefix"] = [text[:20] for text in parsed.assumptions]

    # Second gate: scan what is actually about to be returned.
    if reject_leaks and _detect_leaks(payload):
        return None

    return payload
261
+
262
+
263
def sanitize_to_json(
    packet: Union[Packet, str],
    level: Optional[str] = None,
    reject_leaks: bool = True,
) -> Optional[str]:
    """Run sanitize() and encode its payload as compact JSON.

    Returns None whenever sanitize() returns None; the arguments are
    passed through to sanitize() unchanged.
    """
    payload = sanitize(packet, level=level, reject_leaks=reject_leaks)
    if payload is None:
        return None
    return json.dumps(payload, separators=(",", ":"))
@@ -0,0 +1,36 @@
1
+ """MLLANG slot definitions."""
2
+
3
+ # Canonical slot order (16 slots)
4
+ SLOT_ORDER = ["V", "I", "G", "S", "D", "E", "U", "R", "T", "F", "Y", "B", "N", "H", "P", "A"]
5
+
6
+ # Required slots (must be present for valid packet)
7
+ REQUIRED_SLOTS = ["V", "G", "S", "N", "H"]
8
+
9
+ # Slot meanings
10
+ SLOT_DESCRIPTIONS = {
11
+ "V": "version + round (e.g. V:0.1.r1)",
12
+ "I": "thread-id",
13
+ "G": "goal-map {key=value, ...}",
14
+ "S": "state-map {key=value, ...}",
15
+ "D": "decisions [item, item, ...]",
16
+ "E": "evidence [path, citation, ...]",
17
+ "U": "unknowns [?question, ...]",
18
+ "R": "risks [risk, ...]",
19
+ "T": "test result {check=pass|fail, ...}",
20
+ "F": "files [path, ...]",
21
+ "Y": "tool-calls [$verb(args), ...]",
22
+ "B": "budget cap {tokens=N, time=Ns, money=N}",
23
+ "N": "next @agent -> verb",
24
+ "H": "halt (see HALT_ENUM)",
25
+ "P": "confidence float 0.00-1.00",
26
+ "A": "assumptions [item, ...]",
27
+ }
28
+
29
+ # Slots that contain maps (key=value)
30
+ MAP_SLOTS = {"G", "S", "T", "B"}
31
+
32
+ # Slots that contain lists
33
+ LIST_SLOTS = {"D", "E", "U", "R", "F", "Y", "A"}
34
+
35
+ # Slots with scalar values
36
+ SCALAR_SLOTS = {"V", "I", "N", "H", "P"}
@@ -0,0 +1,21 @@
1
+ Metadata-Version: 2.4
2
+ Name: mllang-protocol
3
+ Version: 0.1.2
4
+ Summary: MLLANG v0.1 reference parser — compact text-surface protocol for AI-agent state, lives inside markdown fenced blocks
5
+ Author: Jake Liu
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/jakeliu/mllang
8
+ Project-URL: Repository, https://github.com/jakeliu/mllang
9
+ Project-URL: Documentation, https://mllang.com
10
+ Project-URL: Issues, https://github.com/jakeliu/mllang/issues
11
+ Keywords: mllang,ai,agents,multi-agent,llm,protocol
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Software Development :: Libraries
21
+ Requires-Python: >=3.9
@@ -0,0 +1,12 @@
1
+ pyproject.toml
2
+ mllang/__init__.py
3
+ mllang/embed.py
4
+ mllang/halt.py
5
+ mllang/operators.py
6
+ mllang/packet.py
7
+ mllang/sanitize.py
8
+ mllang/slots.py
9
+ mllang_protocol.egg-info/PKG-INFO
10
+ mllang_protocol.egg-info/SOURCES.txt
11
+ mllang_protocol.egg-info/dependency_links.txt
12
+ mllang_protocol.egg-info/top_level.txt
@@ -0,0 +1,35 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "mllang-protocol"
7
+ version = "0.1.2"
8
+ description = "MLLANG v0.1 reference parser — compact text-surface protocol for AI-agent state, lives inside markdown fenced blocks"
9
+ license = {text = "Apache-2.0"}
10
+ requires-python = ">=3.9"
11
+ authors = [
12
+ {name = "Jake Liu"}
13
+ ]
14
+ keywords = ["mllang", "ai", "agents", "multi-agent", "llm", "protocol"]
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "Intended Audience :: Developers",
18
+ "License :: OSI Approved :: Apache Software License",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.9",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Topic :: Software Development :: Libraries",
25
+ ]
26
+ dependencies = []
27
+
28
+ [project.urls]
29
+ Homepage = "https://github.com/jakeliu/mllang"
30
+ Repository = "https://github.com/jakeliu/mllang"
31
+ Documentation = "https://mllang.com"
32
+ Issues = "https://github.com/jakeliu/mllang/issues"
33
+
34
+ [tool.setuptools.packages.find]
35
+ include = ["mllang*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+