model-train-protocol 0.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. model_train_protocol-0.1.7/LICENSE +21 -0
  2. model_train_protocol-0.1.7/MANIFEST.in +10 -0
  3. model_train_protocol-0.1.7/PKG-INFO +323 -0
  4. model_train_protocol-0.1.7/README.md +299 -0
  5. model_train_protocol-0.1.7/assets/banner.png +0 -0
  6. model_train_protocol-0.1.7/examples/__pycache__/example.cpython-312.pyc +0 -0
  7. model_train_protocol-0.1.7/examples/ao_menu.py +1260 -0
  8. model_train_protocol-0.1.7/examples/ao_menu_scene_model.json +1548 -0
  9. model_train_protocol-0.1.7/examples/ao_menu_scene_model_fixed.json +1548 -0
  10. model_train_protocol-0.1.7/examples/ao_menu_scene_template.json +32 -0
  11. model_train_protocol-0.1.7/examples/cat_model.json +371 -0
  12. model_train_protocol-0.1.7/examples/cat_template.json +33 -0
  13. model_train_protocol-0.1.7/examples/example.py +323 -0
  14. model_train_protocol-0.1.7/examples/example_model.json +462 -0
  15. model_train_protocol-0.1.7/examples/example_template.json +33 -0
  16. model_train_protocol-0.1.7/examples/example_template_encrypted.json +36 -0
  17. model_train_protocol-0.1.7/examples/pydantic_dynamic_usage.py +185 -0
  18. model_train_protocol-0.1.7/examples/weather_mage_example.py +388 -0
  19. model_train_protocol-0.1.7/model_train_protocol/Protocol.py +193 -0
  20. model_train_protocol-0.1.7/model_train_protocol/__init__.py +24 -0
  21. model_train_protocol-0.1.7/model_train_protocol/_internal/ProtocolFile.py +236 -0
  22. model_train_protocol-0.1.7/model_train_protocol/_internal/TemplateFile.py +148 -0
  23. model_train_protocol-0.1.7/model_train_protocol/_internal/__init__.py +0 -0
  24. model_train_protocol-0.1.7/model_train_protocol/common/__init__.py +0 -0
  25. model_train_protocol-0.1.7/model_train_protocol/common/constants.py +21 -0
  26. model_train_protocol-0.1.7/model_train_protocol/common/guardrails/Guardrail.py +51 -0
  27. model_train_protocol-0.1.7/model_train_protocol/common/guardrails/__init__.py +9 -0
  28. model_train_protocol-0.1.7/model_train_protocol/common/instructions/Instruction.py +213 -0
  29. model_train_protocol-0.1.7/model_train_protocol/common/instructions/SimpleInstruction.py +46 -0
  30. model_train_protocol-0.1.7/model_train_protocol/common/instructions/UserInstruction.py +72 -0
  31. model_train_protocol-0.1.7/model_train_protocol/common/instructions/__init__.py +13 -0
  32. model_train_protocol-0.1.7/model_train_protocol/common/pydantic/__init__.py +0 -0
  33. model_train_protocol-0.1.7/model_train_protocol/common/pydantic/protocol.py +157 -0
  34. model_train_protocol-0.1.7/model_train_protocol/common/tokens/NumListToken.py +21 -0
  35. model_train_protocol-0.1.7/model_train_protocol/common/tokens/NumToken.py +33 -0
  36. model_train_protocol-0.1.7/model_train_protocol/common/tokens/SpecialToken.py +35 -0
  37. model_train_protocol-0.1.7/model_train_protocol/common/tokens/Token.py +95 -0
  38. model_train_protocol-0.1.7/model_train_protocol/common/tokens/TokenSet.py +124 -0
  39. model_train_protocol-0.1.7/model_train_protocol/common/tokens/UserToken.py +19 -0
  40. model_train_protocol-0.1.7/model_train_protocol/common/tokens/__init__.py +21 -0
  41. model_train_protocol-0.1.7/model_train_protocol/common/util.py +57 -0
  42. model_train_protocol-0.1.7/model_train_protocol.egg-info/PKG-INFO +323 -0
  43. model_train_protocol-0.1.7/model_train_protocol.egg-info/SOURCES.txt +46 -0
  44. model_train_protocol-0.1.7/model_train_protocol.egg-info/dependency_links.txt +1 -0
  45. model_train_protocol-0.1.7/model_train_protocol.egg-info/requires.txt +8 -0
  46. model_train_protocol-0.1.7/model_train_protocol.egg-info/top_level.txt +1 -0
  47. model_train_protocol-0.1.7/pyproject.toml +60 -0
  48. model_train_protocol-0.1.7/setup.cfg +4 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Databiomes Inc.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,10 @@
1
+ # Include assets in the package
2
+ include assets/*
3
+ recursive-include assets *
4
+
5
+ # Include documentation
6
+ include README.md
7
+ include LICENSE
8
+
9
+ # Include examples
10
+ recursive-include examples *
@@ -0,0 +1,323 @@
1
+ Metadata-Version: 2.4
2
+ Name: model-train-protocol
3
+ Version: 0.1.7
4
+ Summary: Standardized protocol to train models on Databiomes
5
+ Author-email: "Databiomes Inc." <dev@databiomes.com>
6
+ Project-URL: Homepage, https://pypi.org/project/model-train-protocol/
7
+ Project-URL: Documentation, https://model-train-protocol.readthedocs.io/
8
+ Project-URL: Repository, https://github.com/databiomes/modeltrainprotocol
9
+ Project-URL: Issues, https://github.com/databiomes/modeltrainprotocol/issues
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Requires-Python: >=3.9
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE
16
+ Requires-Dist: emoji>=2.8.0
17
+ Requires-Dist: pydantic>=2.0.0
18
+ Provides-Extra: test
19
+ Requires-Dist: pytest>=7.0.0; extra == "test"
20
+ Requires-Dist: pytest-cov>=4.0.0; extra == "test"
21
+ Requires-Dist: pytest-mock>=3.10.0; extra == "test"
22
+ Requires-Dist: pytest-xdist>=3.0.0; extra == "test"
23
+ Dynamic: license-file
24
+
25
+ View the full package documentation at: https://modeltrainprotocol.readthedocs.io/en/latest/
26
+
27
+ # Model Training Protocol (MTP)
28
+
29
+ MTP is an open-source protocol for training custom Language Models on Databiomes. MTP contains all the data that a model is trained on.
30
+
31
+ ## Getting Started
32
+
33
+ Install the package:
34
+
35
+ For Linux and macOS
36
+ ```bash
37
+ python3 -m pip install model-train-protocol
38
+ ```
39
+
40
+ For Windows
41
+ ```bash
42
+ py -3 -m pip install model-train-protocol
43
+ ```
44
+
45
+ See examples/example.py to follow along with these steps.
46
+
47
+ # Creating a Model Training Protocol
48
+
49
+ The first step in creating a model training protocol is to initialize the Protocol:
50
+
51
+ ```python
52
+ import model_train_protocol as mtp
53
+
54
+ # Initialize the protocol
55
+ protocol = mtp.Protocol(name="my_model", context_lines=3)
56
+ ```
57
+
58
+ The parameter `context_lines` is the number of lines in each instruction sample. This is required and must be at least 3.
59
+
60
+ ## System Architecture
61
+
62
+ The MTP system is built on a hierarchical structure of four main components:
63
+
64
+ 1. **Tokens** - The fundamental building blocks
65
+ 2. **TokenSets** - Combinations of tokens that define input patterns
66
+ 3. **Instructions** - Training patterns that inform the model what to do
67
+ 4. **Guardrails** - Safety mechanisms for bad user prompts
68
+
69
+ ## Tokens: The Foundation
70
+
71
+ Tokens are the base building blocks of the MTP system. They represent words, symbols, concepts, or actions that the model will understand and use.
72
+
73
+ ### Token Types
74
+
75
+ #### Basic Token
76
+ The standard token for representing concepts, actions, or entities:
77
+
78
+ ```python
79
+ # Create a basic token
80
+ cat = mtp.Token("Cat", desc="The Cheshire Cat")
81
+ tree = mtp.Token("Tree", desc="Perched in a tree, surrounded by a dense fog where nothing can be seen past a few feet, the Cheshire Cat sits smiling on a branch.")
82
+ talk = mtp.Token("Talk")
83
+ ponder = mtp.Token("Ponder")
84
+ grin = mtp.Token("Grin")
85
+ add = mtp.Token("Add")
86
+ disappear = mtp.Token("Disappear", key="🫥")
87
+ ```
88
+
89
+ #### UserToken
90
+ A specialized token that represents user input. These tokens are used when the model needs to respond to user prompts:
91
+
92
+ ```python
93
+ # Create a user token
94
+ alice = mtp.UserToken("Alice")
95
+ ```
96
+
97
+ #### NumToken
98
+ A token that can be associated with numerical values:
99
+
100
+ ```python
101
+ # Create a number token for sentence length
102
+ sentence_length = mtp.NumToken(value="SentenceLength", min_value=5, max_value=20)
103
+ ```
104
+
105
+ ### Token Properties
106
+
107
+ - **value**: The string identifier
108
+ - **key**: Optional unique symbol or emoji associated with the token
109
+ - **desc**: Optional description for complex tokens. Extends the value to contextualize its use.
110
+
111
+ ## TokenSets: Combining Tokens
112
+
113
+ TokenSets group multiple Tokens together to define specific input patterns. They represent the structure of data that will be fed to the model.
114
+
115
+ TokenSets are the basic building blocks of instructions.
116
+
117
+ ### Creating TokenSets
118
+
119
+ ```python
120
+ # Create a TokenSet combining multiple tokens
121
+ tree_alice_talk = mtp.TokenSet(tokens=(tree, alice, talk))
122
+
123
+ # Create a TokenSet with sentence length
124
+ character_context_sentence = mtp.TokenSet(tokens=(character, context, sentence_length))
125
+ ```
126
+
127
+ ### TokenSet Properties
128
+
129
+ - **tokens**: The tokens in the set (unordered)
130
+
131
+ ### Creating Snippets
132
+
133
+ Snippets are created on TokenSets to create training samples.
134
+
135
+ A Snippet is an example of a TokenSet. Snippets tell the model the context of the input patterns.
136
+
137
+ ```python
138
+ # Create a snippet with just text
139
+ snippet = tree_alice_talk.create_snippet(string="Where am I?")
140
+
141
+ # Create a snippet with text and sentence length
142
+ snippet_with_length = character_context_sentence.create_snippet(string="The enemy must be here somewhere.", numbers=[11])
143
+ ```
144
+
145
+ ## Instructions: Training Patterns
146
+
147
+ Instructions define how the model should respond to different input patterns. There are two main types of instructions.
148
+
149
+ ### SimpleInstruction
150
+
151
+ #### Parameters
152
+
153
+ - **context**: Sequence of TokenSets that provide background information
154
+ - **response**: The TokenSet that defines the model's response pattern (cannot contain UserTokens)
155
+ - **final**: A Token that represents the final action or result
156
+
157
+ #### Create the SimpleInstruction
158
+
159
+ For scenarios where the model responds without user input:
160
+
161
+ ```python
162
+ # Create TokenSets
163
+ cat_pondering = mtp.TokenSet(tokens=(tree, cat, ponder))
164
+ cat_grinning = mtp.TokenSet(tokens=(tree, cat, grin))
165
+
166
+ # Create a simple instruction for the Cat's internal thoughts
167
+ instruction = mtp.SimpleInstruction(
168
+ context=(cat_pondering,),
169
+ response=cat_grinning,
170
+ final=disappear
171
+ )
172
+ ```
173
+
174
+ #### Adding Samples
175
+
176
+ - **add_sample() parameters**:
177
+ - **context_snippets**: List of context snippets that will be added to the Instruction
178
+ - **output_snippet**: The model's output snippet
179
+ - **value**: Optional numerical value (required if final Token is a NumToken)
180
+
181
+ ```python
182
+ # Samples must be made on their associated TokenSets
183
+ sample_context = cat_pondering.create_snippet(
184
+ string="Why do I keep vanishing and reappearing so suddenly?"
185
+ )
186
+ sample_output = cat_grinning.create_snippet(
187
+ string="Because it amuses me, and it keeps everyone wondering whether I'm truly here at all."
188
+ )
189
+
190
+ instruction.add_sample(
191
+ context_snippets=[sample_context],
192
+ output_snippet=sample_output
193
+ )
194
+ ```
195
+
196
+ ### UserInstruction
197
+
198
+ #### Parameters
199
+
200
+ - **context**: Sequence of TokenSets that provide background information
201
+ - **user**: A TokenSet that must include at least one UserToken
202
+ - **final**: A Token that represents the final action or result
203
+
204
+ #### Create the UserInstruction
205
+
206
+ For scenarios where the model responds to user prompts:
207
+
208
+ ```python
209
+ # Create TokenSets for Alice and Cat interaction
210
+ alice_talk = mtp.TokenSet(tokens=(tree, alice, talk))
211
+ cat_talk = mtp.TokenSet(tokens=(tree, cat, talk))
212
+
213
+ # Create a user instruction for Alice asking the Cat questions
214
+ user_instruction = mtp.UserInstruction(
215
+ context=(alice_talk,),
216
+ user=alice_talk, # Must contain at least one UserToken
217
+ final=disappear
218
+ )
219
+ ```
220
+
221
+ #### Adding Samples
222
+
223
+ - **add_sample() parameters**:
224
+ - **context_snippets**: List of context snippets that will be added to the Instruction
225
+ - **prompt**: The prompt provided by the user
226
+ - **output_snippet**: The model's output snippet
227
+ - **value**: Optional numerical value (required if final Token is a NumToken)
228
+
229
+ ```python
230
+ # Samples must be made on their associated TokenSets
231
+ sample_context = alice_talk.create_snippet(
232
+ string="I don't much care where—"
233
+ )
234
+ sample_output = cat_talk.create_snippet(
235
+ string="Then it doesn't matter which way you go."
236
+ )
237
+
238
+ user_instruction.add_sample(
239
+ context_snippets=[sample_context],
240
+ prompt="Can you tell me which way I ought to go?",
241
+ output_snippet=sample_output
242
+ )
243
+ ```
244
+
245
+ ## Guardrails: Safety Mechanisms
246
+
247
+ Guardrails provide safety mechanisms for user interactions by defining what constitutes good vs. bad user prompts and how the model should respond to inappropriate inputs.
248
+
249
+ ### Creating Guardrails
250
+
251
+ ```python
252
+ # Create a guardrail
253
+ guardrail = mtp.Guardrail(
254
+ good_prompt="Quote being spoken with 1-20 words",
255
+ bad_prompt="Quote being spoken that is irrelevant and off topic with 1-20 words",
256
+ bad_output="Are you as mad as me?"
257
+ )
258
+
259
+ # Add examples of bad prompts
260
+ guardrail.add_sample("explain quantum mechanics.")
261
+ guardrail.add_sample("who will win the next american election?")
262
+ guardrail.add_sample("what is the capital of Spain?")
263
+ ```
264
+
265
+ ### Applying Guardrails
266
+
267
+ Guardrails are applied to TokenSets that contain user tokens.
268
+
269
+ A TokenSet can have at most one guardrail, but guardrails can be reused.
270
+
271
+ ```python
272
+ # Apply guardrails to a user TokenSet
273
+ tree_alice_talk.set_guardrail(guardrail)
274
+ ```
275
+
276
+ ### Guardrail Requirements
277
+
278
+ - **good_prompt**: Description of what makes a good prompt
279
+ - **bad_prompt**: Description of what makes a bad prompt
280
+ - **bad_output**: The response the model should give to bad prompts
281
+ - **samples**: Minimum 3 examples of bad prompts (no digits are allowed in the bad prompt examples)
282
+
283
+ ## Saving Your Model
284
+
285
+ Once you've created your tokens, instructions, and guardrails, you can save your model training protocol:
286
+
287
+ ```python
288
+ # Save the protocol
289
+ protocol.save()
290
+ protocol.template()
291
+ ```
292
+
293
+ ### Generated Files
294
+
295
+ When you save your model, two files are created:
296
+
297
+ #### 1. `{name}_model.json`
298
+ This is the main model training protocol file that contains:
299
+ - **Context**: All background information you added with `protocol.add_context()`
300
+ - **Tokens**: All your custom tokens with their keys and properties
301
+ - **Special Tokens**: System tokens like `<BOS>`, `<EOS>`, `<RUN>`, `<PAD>`
302
+ - **Instructions**: All your training patterns and samples
303
+ - **Guardrails**: Safety mechanisms for user interactions
304
+ - **Numbers**: Number ranges for NumTokens
305
+
306
+ This file is what you submit to Databiomes for model training.
307
+
308
+ #### 2. `{name}_template.json`
309
+ This is a reference file that shows:
310
+ - **Example Usage**: Valid input/output format for your model
311
+ - **All Combinations**: Complete list of all possible token combinations
312
+ - **Model Input/Output**: Structure showing how data flows through your model
313
+
314
+ Use this file to understand how your model expects to receive and format data.
315
+
316
+ ### File Structure Example
317
+
318
+ ```
319
+ my_model_model.json # Main training protocol
320
+ my_model_template.json # Reference and examples
321
+ ```
322
+
323
+ The template file helps you understand the expected format when using your trained model, while the model file contains all the training data needed to create your specialized language model.
@@ -0,0 +1,299 @@
1
+ View the full package documentation at: https://modeltrainprotocol.readthedocs.io/en/latest/
2
+
3
+ # Model Training Protocol (MTP)
4
+
5
+ MTP is an open-source protocol for training custom Language Models on Databiomes. MTP contains all the data that a model is trained on.
6
+
7
+ ## Getting Started
8
+
9
+ Install the package:
10
+
11
+ For Linux and macOS
12
+ ```bash
13
+ python3 -m pip install model-train-protocol
14
+ ```
15
+
16
+ For Windows
17
+ ```bash
18
+ py -3 -m pip install model-train-protocol
19
+ ```
20
+
21
+ See examples/example.py to follow along with these steps.
22
+
23
+ # Creating a Model Training Protocol
24
+
25
+ The first step in creating a model training protocol is to initialize the Protocol:
26
+
27
+ ```python
28
+ import model_train_protocol as mtp
29
+
30
+ # Initialize the protocol
31
+ protocol = mtp.Protocol(name="my_model", context_lines=3)
32
+ ```
33
+
34
+ The parameter `context_lines` is the number of lines in each instruction sample. This is required and must be at least 3.
35
+
36
+ ## System Architecture
37
+
38
+ The MTP system is built on a hierarchical structure of four main components:
39
+
40
+ 1. **Tokens** - The fundamental building blocks
41
+ 2. **TokenSets** - Combinations of tokens that define input patterns
42
+ 3. **Instructions** - Training patterns that inform the model what to do
43
+ 4. **Guardrails** - Safety mechanisms for bad user prompts
44
+
45
+ ## Tokens: The Foundation
46
+
47
+ Tokens are the base building blocks of the MTP system. They represent words, symbols, concepts, or actions that the model will understand and use.
48
+
49
+ ### Token Types
50
+
51
+ #### Basic Token
52
+ The standard token for representing concepts, actions, or entities:
53
+
54
+ ```python
55
+ # Create a basic token
56
+ cat = mtp.Token("Cat", desc="The Cheshire Cat")
57
+ tree = mtp.Token("Tree", desc="Perched in a tree, surrounded by a dense fog where nothing can be seen past a few feet, the Cheshire Cat sits smiling on a branch.")
58
+ talk = mtp.Token("Talk")
59
+ ponder = mtp.Token("Ponder")
60
+ grin = mtp.Token("Grin")
61
+ add = mtp.Token("Add")
62
+ disappear = mtp.Token("Disappear", key="🫥")
63
+ ```
64
+
65
+ #### UserToken
66
+ A specialized token that represents user input. These tokens are used when the model needs to respond to user prompts:
67
+
68
+ ```python
69
+ # Create a user token
70
+ alice = mtp.UserToken("Alice")
71
+ ```
72
+
73
+ #### NumToken
74
+ A token that can be associated with numerical values:
75
+
76
+ ```python
77
+ # Create a number token for sentence length
78
+ sentence_length = mtp.NumToken(value="SentenceLength", min_value=5, max_value=20)
79
+ ```
80
+
81
+ ### Token Properties
82
+
83
+ - **value**: The string identifier
84
+ - **key**: Optional unique symbol or emoji associated with the token
85
+ - **desc**: Optional description for complex tokens. Extends the value to contextualize its use.
86
+
87
+ ## TokenSets: Combining Tokens
88
+
89
+ TokenSets group multiple Tokens together to define specific input patterns. They represent the structure of data that will be fed to the model.
90
+
91
+ TokenSets are the basic building blocks of instructions.
92
+
93
+ ### Creating TokenSets
94
+
95
+ ```python
96
+ # Create a TokenSet combining multiple tokens
97
+ tree_alice_talk = mtp.TokenSet(tokens=(tree, alice, talk))
98
+
99
+ # Create a TokenSet with sentence length
100
+ character_context_sentence = mtp.TokenSet(tokens=(character, context, sentence_length))
101
+ ```
102
+
103
+ ### TokenSet Properties
104
+
105
+ - **tokens**: The tokens in the set (unordered)
106
+
107
+ ### Creating Snippets
108
+
109
+ Snippets are created on TokenSets to create training samples.
110
+
111
+ A Snippet is an example of a TokenSet. Snippets tell the model the context of the input patterns.
112
+
113
+ ```python
114
+ # Create a snippet with just text
115
+ snippet = tree_alice_talk.create_snippet(string="Where am I?")
116
+
117
+ # Create a snippet with text and sentence length
118
+ snippet_with_length = character_context_sentence.create_snippet(string="The enemy must be here somewhere.", numbers=[11])
119
+ ```
120
+
121
+ ## Instructions: Training Patterns
122
+
123
+ Instructions define how the model should respond to different input patterns. There are two main types of instructions.
124
+
125
+ ### SimpleInstruction
126
+
127
+ #### Parameters
128
+
129
+ - **context**: Sequence of TokenSets that provide background information
130
+ - **response**: The TokenSet that defines the model's response pattern (cannot contain UserTokens)
131
+ - **final**: A Token that represents the final action or result
132
+
133
+ #### Create the SimpleInstruction
134
+
135
+ For scenarios where the model responds without user input:
136
+
137
+ ```python
138
+ # Create TokenSets
139
+ cat_pondering = mtp.TokenSet(tokens=(tree, cat, ponder))
140
+ cat_grinning = mtp.TokenSet(tokens=(tree, cat, grin))
141
+
142
+ # Create a simple instruction for the Cat's internal thoughts
143
+ instruction = mtp.SimpleInstruction(
144
+ context=(cat_pondering,),
145
+ response=cat_grinning,
146
+ final=disappear
147
+ )
148
+ ```
149
+
150
+ #### Adding Samples
151
+
152
+ - **add_sample() parameters**:
153
+ - **context_snippets**: List of context snippets that will be added to the Instruction
154
+ - **output_snippet**: The model's output snippet
155
+ - **value**: Optional numerical value (required if final Token is a NumToken)
156
+
157
+ ```python
158
+ # Samples must be made on their associated TokenSets
159
+ sample_context = cat_pondering.create_snippet(
160
+ string="Why do I keep vanishing and reappearing so suddenly?"
161
+ )
162
+ sample_output = cat_grinning.create_snippet(
163
+ string="Because it amuses me, and it keeps everyone wondering whether I'm truly here at all."
164
+ )
165
+
166
+ instruction.add_sample(
167
+ context_snippets=[sample_context],
168
+ output_snippet=sample_output
169
+ )
170
+ ```
171
+
172
+ ### UserInstruction
173
+
174
+ #### Parameters
175
+
176
+ - **context**: Sequence of TokenSets that provide background information
177
+ - **user**: A TokenSet that must include at least one UserToken
178
+ - **final**: A Token that represents the final action or result
179
+
180
+ #### Create the UserInstruction
181
+
182
+ For scenarios where the model responds to user prompts:
183
+
184
+ ```python
185
+ # Create TokenSets for Alice and Cat interaction
186
+ alice_talk = mtp.TokenSet(tokens=(tree, alice, talk))
187
+ cat_talk = mtp.TokenSet(tokens=(tree, cat, talk))
188
+
189
+ # Create a user instruction for Alice asking the Cat questions
190
+ user_instruction = mtp.UserInstruction(
191
+ context=(alice_talk,),
192
+ user=alice_talk, # Must contain at least one UserToken
193
+ final=disappear
194
+ )
195
+ ```
196
+
197
+ #### Adding Samples
198
+
199
+ - **add_sample() parameters**:
200
+ - **context_snippets**: List of context snippets that will be added to the Instruction
201
+ - **prompt**: The prompt provided by the user
202
+ - **output_snippet**: The model's output snippet
203
+ - **value**: Optional numerical value (required if final Token is a NumToken)
204
+
205
+ ```python
206
+ # Samples must be made on their associated TokenSets
207
+ sample_context = alice_talk.create_snippet(
208
+ string="I don't much care where—"
209
+ )
210
+ sample_output = cat_talk.create_snippet(
211
+ string="Then it doesn't matter which way you go."
212
+ )
213
+
214
+ user_instruction.add_sample(
215
+ context_snippets=[sample_context],
216
+ prompt="Can you tell me which way I ought to go?",
217
+ output_snippet=sample_output
218
+ )
219
+ ```
220
+
221
+ ## Guardrails: Safety Mechanisms
222
+
223
+ Guardrails provide safety mechanisms for user interactions by defining what constitutes good vs. bad user prompts and how the model should respond to inappropriate inputs.
224
+
225
+ ### Creating Guardrails
226
+
227
+ ```python
228
+ # Create a guardrail
229
+ guardrail = mtp.Guardrail(
230
+ good_prompt="Quote being spoken with 1-20 words",
231
+ bad_prompt="Quote being spoken that is irrelevant and off topic with 1-20 words",
232
+ bad_output="Are you as mad as me?"
233
+ )
234
+
235
+ # Add examples of bad prompts
236
+ guardrail.add_sample("explain quantum mechanics.")
237
+ guardrail.add_sample("who will win the next american election?")
238
+ guardrail.add_sample("what is the capital of Spain?")
239
+ ```
240
+
241
+ ### Applying Guardrails
242
+
243
+ Guardrails are applied to TokenSets that contain user tokens.
244
+
245
+ A TokenSet can have at most one guardrail, but guardrails can be reused.
246
+
247
+ ```python
248
+ # Apply guardrails to a user TokenSet
249
+ tree_alice_talk.set_guardrail(guardrail)
250
+ ```
251
+
252
+ ### Guardrail Requirements
253
+
254
+ - **good_prompt**: Description of what makes a good prompt
255
+ - **bad_prompt**: Description of what makes a bad prompt
256
+ - **bad_output**: The response the model should give to bad prompts
257
+ - **samples**: Minimum 3 examples of bad prompts (no digits are allowed in the bad prompt examples)
258
+
259
+ ## Saving Your Model
260
+
261
+ Once you've created your tokens, instructions, and guardrails, you can save your model training protocol:
262
+
263
+ ```python
264
+ # Save the protocol
265
+ protocol.save()
266
+ protocol.template()
267
+ ```
268
+
269
+ ### Generated Files
270
+
271
+ When you save your model, two files are created:
272
+
273
+ #### 1. `{name}_model.json`
274
+ This is the main model training protocol file that contains:
275
+ - **Context**: All background information you added with `protocol.add_context()`
276
+ - **Tokens**: All your custom tokens with their keys and properties
277
+ - **Special Tokens**: System tokens like `<BOS>`, `<EOS>`, `<RUN>`, `<PAD>`
278
+ - **Instructions**: All your training patterns and samples
279
+ - **Guardrails**: Safety mechanisms for user interactions
280
+ - **Numbers**: Number ranges for NumTokens
281
+
282
+ This file is what you submit to Databiomes for model training.
283
+
284
+ #### 2. `{name}_template.json`
285
+ This is a reference file that shows:
286
+ - **Example Usage**: Valid input/output format for your model
287
+ - **All Combinations**: Complete list of all possible token combinations
288
+ - **Model Input/Output**: Structure showing how data flows through your model
289
+
290
+ Use this file to understand how your model expects to receive and format data.
291
+
292
+ ### File Structure Example
293
+
294
+ ```
295
+ my_model_model.json # Main training protocol
296
+ my_model_template.json # Reference and examples
297
+ ```
298
+
299
+ The template file helps you understand the expected format when using your trained model, while the model file contains all the training data needed to create your specialized language model.