tep 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/Makefile +134 -0
  4. data/README.md +247 -0
  5. data/SINATRA_COMPAT.md +376 -0
  6. data/bin/tep +2156 -0
  7. data/examples/agentic_chat/README.md +103 -0
  8. data/examples/agentic_chat/app.rb +310 -0
  9. data/examples/api_gateway/README.md +49 -0
  10. data/examples/api_gateway/app.rb +66 -0
  11. data/examples/blog/app.rb +367 -0
  12. data/examples/blog/views/index.erb +36 -0
  13. data/examples/blog/views/login.erb +28 -0
  14. data/examples/blog/views/new_post.erb +25 -0
  15. data/examples/blog/views/show.erb +16 -0
  16. data/examples/chat/app.rb +278 -0
  17. data/examples/chat/assets/logo.svg +13 -0
  18. data/examples/chat/assets/style.css +209 -0
  19. data/examples/chat/views/index.erb +142 -0
  20. data/examples/chatbot/README.md +111 -0
  21. data/examples/chatbot/app.rb +1024 -0
  22. data/examples/chatbot/assets/chat.js +249 -0
  23. data/examples/chatbot/assets/compare.js +93 -0
  24. data/examples/chatbot/assets/markdown.js +84 -0
  25. data/examples/chatbot/assets/style.css +215 -0
  26. data/examples/chatbot/schema.sql +25 -0
  27. data/examples/chatbot/views/compare.erb +43 -0
  28. data/examples/chatbot/views/index.erb +42 -0
  29. data/examples/chatbot/views/login.erb +22 -0
  30. data/examples/chatbot/views/setup.erb +23 -0
  31. data/examples/counter/README.md +68 -0
  32. data/examples/counter/app.rb +85 -0
  33. data/examples/experiments/AGENTS.md +91 -0
  34. data/examples/experiments/README.md +99 -0
  35. data/examples/experiments/app.rb +225 -0
  36. data/examples/geohash/Gemfile +11 -0
  37. data/examples/geohash/Gemfile.lock +17 -0
  38. data/examples/geohash/README.md +58 -0
  39. data/examples/geohash/app.rb +33 -0
  40. data/examples/hello.rb +120 -0
  41. data/examples/llm_gateway/README.md +73 -0
  42. data/examples/llm_gateway/app.rb +91 -0
  43. data/examples/maidenhead/Gemfile +7 -0
  44. data/examples/maidenhead/Gemfile.lock +17 -0
  45. data/examples/maidenhead/README.md +47 -0
  46. data/examples/maidenhead/app.rb +46 -0
  47. data/examples/pg_hello.rb +76 -0
  48. data/examples/qdrant/Gemfile +11 -0
  49. data/examples/qdrant/Gemfile.lock +29 -0
  50. data/examples/qdrant/README.md +54 -0
  51. data/examples/sinatra_style.rb +32 -0
  52. data/examples/websocket_echo.rb +37 -0
  53. data/lib/tep/agent_delegation.rb +35 -0
  54. data/lib/tep/app.rb +291 -0
  55. data/lib/tep/assets.rb +52 -0
  56. data/lib/tep/auth.rb +78 -0
  57. data/lib/tep/auth_bearer_token.rb +126 -0
  58. data/lib/tep/auth_oauth2.rb +189 -0
  59. data/lib/tep/auth_oauth2_client.rb +29 -0
  60. data/lib/tep/auth_oauth2_code.rb +40 -0
  61. data/lib/tep/auth_session_cookie.rb +132 -0
  62. data/lib/tep/broadcast.rb +265 -0
  63. data/lib/tep/broadcast_subscription.rb +42 -0
  64. data/lib/tep/cache.rb +49 -0
  65. data/lib/tep/events.rb +257 -0
  66. data/lib/tep/filter.rb +21 -0
  67. data/lib/tep/handler.rb +35 -0
  68. data/lib/tep/http.rb +599 -0
  69. data/lib/tep/identity.rb +67 -0
  70. data/lib/tep/job.rb +186 -0
  71. data/lib/tep/json.rb +572 -0
  72. data/lib/tep/jwt.rb +126 -0
  73. data/lib/tep/live_view.rb +219 -0
  74. data/lib/tep/llm.rb +505 -0
  75. data/lib/tep/logger.rb +85 -0
  76. data/lib/tep/mcp.rb +203 -0
  77. data/lib/tep/multipart.rb +98 -0
  78. data/lib/tep/net.rb +155 -0
  79. data/lib/tep/openai_server.rb +725 -0
  80. data/lib/tep/parallel.rb +168 -0
  81. data/lib/tep/parser.rb +81 -0
  82. data/lib/tep/password.rb +102 -0
  83. data/lib/tep/pg.rb +1128 -0
  84. data/lib/tep/presence.rb +589 -0
  85. data/lib/tep/presence_entry.rb +52 -0
  86. data/lib/tep/proxy.rb +801 -0
  87. data/lib/tep/request.rb +194 -0
  88. data/lib/tep/response.rb +134 -0
  89. data/lib/tep/router.rb +137 -0
  90. data/lib/tep/scheduler.rb +342 -0
  91. data/lib/tep/security.rb +140 -0
  92. data/lib/tep/server.rb +276 -0
  93. data/lib/tep/server_scheduled.rb +375 -0
  94. data/lib/tep/session.rb +98 -0
  95. data/lib/tep/shell.rb +62 -0
  96. data/lib/tep/sphttp.c +858 -0
  97. data/lib/tep/sqlite.rb +215 -0
  98. data/lib/tep/streamer.rb +31 -0
  99. data/lib/tep/tep_pg.c +769 -0
  100. data/lib/tep/tep_sqlite.c +320 -0
  101. data/lib/tep/url.rb +161 -0
  102. data/lib/tep/version.rb +3 -0
  103. data/lib/tep/websocket/connection.rb +171 -0
  104. data/lib/tep/websocket/driver.rb +169 -0
  105. data/lib/tep/websocket/frame.rb +238 -0
  106. data/lib/tep/websocket/handshake.rb +159 -0
  107. data/lib/tep/websocket.rb +68 -0
  108. data/lib/tep.rb +981 -0
  109. data/public/hello.txt +1 -0
  110. data/public/style.css +4 -0
  111. data/spinel-ext.json +33 -0
  112. data/test/helper.rb +248 -0
  113. data/test/real_world/01_simple.rb +5 -0
  114. data/test/real_world/02_lifecycle.rb +20 -0
  115. data/test/real_world/03_chat.rb +75 -0
  116. data/test/real_world/04_health_api.rb +25 -0
  117. data/test/real_world/05_todo_api.rb +57 -0
  118. data/test/real_world/06_basic_auth.rb +25 -0
  119. data/test/real_world/07_bbc_rest_api.rb +228 -0
  120. data/test/real_world/07_sklise_things.rb +109 -0
  121. data/test/real_world/08_jwd83_helloworld.rb +56 -0
  122. data/test/run_all.rb +7 -0
  123. data/test/run_parallel.rb +89 -0
  124. data/test/spinel_scheduled_burst_segv_repro.rb +33 -0
  125. data/test/test_api_gateway.rb +76 -0
  126. data/test/test_auth.rb +223 -0
  127. data/test/test_auth_oauth2.rb +208 -0
  128. data/test/test_auth_session_cookie.rb +198 -0
  129. data/test/test_broadcast.rb +197 -0
  130. data/test/test_broadcast_pg.rb +135 -0
  131. data/test/test_cache.rb +98 -0
  132. data/test/test_cache_static.rb +48 -0
  133. data/test/test_cookies.rb +52 -0
  134. data/test/test_erb.rb +53 -0
  135. data/test/test_erb_ivars.rb +58 -0
  136. data/test/test_events.rb +114 -0
  137. data/test/test_filters.rb +41 -0
  138. data/test/test_geohash_example.rb +89 -0
  139. data/test/test_http.rb +137 -0
  140. data/test/test_http_pool.rb +122 -0
  141. data/test/test_http_pool_send.rb +57 -0
  142. data/test/test_identity.rb +165 -0
  143. data/test/test_inbound_tls.rb +101 -0
  144. data/test/test_inbound_tls_scheduled.rb +101 -0
  145. data/test/test_job.rb +108 -0
  146. data/test/test_json.rb +168 -0
  147. data/test/test_jwt.rb +143 -0
  148. data/test/test_live_view.rb +324 -0
  149. data/test/test_llm.rb +250 -0
  150. data/test/test_llm_gateway.rb +95 -0
  151. data/test/test_logger.rb +101 -0
  152. data/test/test_maidenhead_example.rb +86 -0
  153. data/test/test_mcp.rb +264 -0
  154. data/test/test_misc_v02.rb +54 -0
  155. data/test/test_modular.rb +43 -0
  156. data/test/test_multi_filters.rb +40 -0
  157. data/test/test_mustache.rb +57 -0
  158. data/test/test_openai_server.rb +598 -0
  159. data/test/test_optional_segments.rb +45 -0
  160. data/test/test_parallel.rb +102 -0
  161. data/test/test_params.rb +99 -0
  162. data/test/test_pass.rb +42 -0
  163. data/test/test_password.rb +101 -0
  164. data/test/test_pg.rb +673 -0
  165. data/test/test_presence.rb +374 -0
  166. data/test/test_presence_pg.rb +309 -0
  167. data/test/test_proxy.rb +556 -0
  168. data/test/test_proxy_dsl.rb +119 -0
  169. data/test/test_proxy_streaming.rb +146 -0
  170. data/test/test_real_world.rb +397 -0
  171. data/test/test_regex_routes.rb +52 -0
  172. data/test/test_request_methods.rb +102 -0
  173. data/test/test_response.rb +123 -0
  174. data/test/test_routing.rb +109 -0
  175. data/test/test_scheduler.rb +153 -0
  176. data/test/test_security.rb +72 -0
  177. data/test/test_server_scheduled.rb +56 -0
  178. data/test/test_sessions.rb +59 -0
  179. data/test/test_shell.rb +54 -0
  180. data/test/test_sqlite.rb +148 -0
  181. data/test/test_sqlite_cached.rb +171 -0
  182. data/test/test_static.rb +57 -0
  183. data/test/test_streaming.rb +96 -0
  184. data/test/test_unsupported.rb +32 -0
  185. data/test/test_websocket.rb +152 -0
  186. data/test/test_websocket_echo.rb +138 -0
  187. data/test/views/greet.erb +5 -0
  188. data/test/views/hello.erb +5 -0
  189. data/test/views/list.erb +5 -0
  190. data/test/views/m_ivars.mustache +3 -0
  191. data/test/views/m_simple.mustache +4 -0
  192. data/test/views/mixed.erb +3 -0
  193. metadata +264 -0
@@ -0,0 +1,725 @@
1
+ # Tep::Llm::OpenAI::Server -- serve OpenAI-compatible HTTP from local
2
+ # compute (Battery 7). Unlike Tep::Proxy there's no upstream: the route
3
+ # + events shell is tep, the actual inference is a pluggable Backend an
4
+ # app supplies. See docs/OPENAI-SERVER-BATTERY.md.
5
+ #
6
+ # Chunk 7.1a introduced the Backend interface apps subclass, the
7
+ # Server.use / .serve! DSL, and GET /v1/models. Token-level completions
8
+ # (/v1/completions), events emission, and streaming were added in later
9
+ # chunks (7.1b / 7.2) and now live in this file too.
10
+ #
11
+ # class ToyBackend < Tep::Llm::OpenAI::Backend
12
+ # def list_models; ["smollm2-135m"]; end
13
+ # # generate_from_tokens / device_kind / ... overridden as needed
14
+ # end
15
+ # Tep::Llm::OpenAI::Server.use(ToyBackend.new)
16
+ # Tep::Llm::OpenAI::Server.serve!
17
+ #
18
+ # Why subclass-and-override + `use(ConcreteBackend.new)`: the concrete
19
+ # instance flows into the APP.openai_backend slot from the user's
20
+ # `.new`, so spinel's observed-class set includes it and the route's
21
+ # `APP.openai_backend.list_models` dispatches to the override (verified
22
+ # spike). Same shape Tep::LiveView uses for its view instances.
23
+ module Tep
24
+ class Llm
25
+ module OpenAI
26
+ # The interface an app's backend implements. Defaults make a
27
+ # bare backend safe to compile + serve (empty model list, chat
28
+ # unsupported, cpu device). Subclasses override what they offer.
29
+ class Backend
30
+ # Available model names -> [String]. /v1/models wraps these.
31
+ def list_models
32
+ empty = [""]
33
+ empty.delete_at(0)
34
+ empty
35
+ end
36
+
37
+ # PRIMARY shape: token-level generation (maps to
38
+ # /v1/completions, non-streaming). `token_ids` is the encoded
39
+ # prompt (Array[Integer]); `sampling` is a
40
+ # Tep::Llm::OpenAI::Sampling. Returns a
41
+ # Tep::Llm::OpenAI::Completion (text + usage). The base returns
42
+ # an empty completion so a bare backend compiles; real backends
43
+ # override.
44
+ def generate_from_tokens(model, token_ids, sampling)
45
+ Tep::Llm::OpenAI::Completion.new
46
+ end
47
+
48
+ # STREAMING shape (7.2): the per-token variant for SSE
49
+ # /v1/completions when the request carries "stream": true.
50
+ # The backend writes each token to `sink` via
51
+ # sink.emit_token(piece); the sink (Tep::Llm::OpenAI::StreamSink)
52
+ # formats it as an OpenAI SSE frame and writes to the
53
+ # outbound chunked stream. Blocks/yields don't lower across the
54
+ # spinel boundary, so a typed sink replaces the block --
55
+ # backends never see SSE wire format or the client fd.
56
+ # Base no-op (subclasses override).
57
+ def generate_stream_from_tokens(model, token_ids, sampling, sink)
58
+ 0
59
+ end
60
+
61
+ # Does this backend implement message-level (chat) generation?
62
+ # When false, /v1/chat/completions returns 501. (The chat
63
+ # template is per-model + an ML concern; tep doesn't ship one.)
64
+ def supports_chat?
65
+ false
66
+ end
67
+
68
+ # Message-level (chat) generation. Mirrors generate_from_tokens
69
+ # but receives the raw req so the backend can parse the
70
+ # messages array itself + apply its own chat template. Tep
71
+ # doesn't pre-build a Message[] because templating + role
72
+ # ordering is per-model; the JSON tools live in Tep::Json. The
73
+ # return is reused from the token path (text becomes the
74
+ # assistant message's content). Base no-op; subclasses override.
75
+ # Only reached when supports_chat? returns true -- the handler
76
+ # gates with a 501 otherwise.
77
+ def chat_completion(req)
78
+ Tep::Llm::OpenAI::Completion.new
79
+ end
80
+
81
+ # Streaming chat (#127). Per-token variant for SSE
82
+ # /v1/chat/completions when the request carries "stream":true.
83
+ # Backend writes each token to `sink` via sink.emit_token(piece);
84
+ # the sink formats it as the OpenAI chat-streaming delta frame
85
+ # and writes one chunked frame. Same subclass-override-sink
86
+ # pattern as 7.2 (generate_stream_from_tokens). Base no-op.
87
+ def chat_completion_stream(req, sink)
88
+ 0
89
+ end
90
+
91
+ # Backend's device, surfaced into the run_start event's
92
+ # backend.kind at serve! time. Defaults to cpu.
93
+ def device_kind
94
+ "cpu"
95
+ end
96
+
97
+ # Backends that can embed override this -> true (gates
98
+ # /v1/embeddings, chunk 7.3).
99
+ def supports_embeddings?
100
+ false
101
+ end
102
+
103
+ # Embedding generation for /v1/embeddings. `token_ids` is the
104
+ # encoded input (Array[Integer]; this server speaks IDs only,
105
+ # tokenize client-side, same policy as generate_from_tokens).
106
+ # Returns the pooled embedding as an Array[Float] of length
107
+ # d_model -- the backend owns the lookup + pooling strategy
108
+ # (toy mean-pools per-token embeddings). Base returns an empty
109
+ # vector so a bare backend compiles; only reached when
110
+ # supports_embeddings? is true (EmbeddingsHandler gates 501).
111
+ def generate_embeddings(model, token_ids)
112
+ empty = [0.0]
113
+ empty.delete_at(0)
114
+ empty
115
+ end
116
+ end
117
+
118
+ # The mountable server. Class methods because an app wires one
119
+ # backend per process at boot (`use`) then mounts the standard
120
+ # routes (`serve!`).
121
+ class Server
122
+ # Register the app's backend. Pass a concrete Backend subclass
123
+ # instance; it's stored on Tep::APP and dispatched per request.
124
+ def self.use(backend)
125
+ Tep::APP.set_openai_backend(backend)
126
+ 0
127
+ end
128
+
129
+ # Mount the standard OpenAI routes + (optionally) start the
130
+ # toy/v1 events stream. `events_jsonl` is a JSONL path the
131
+ # per-request inference event + the run_start at boot append
132
+ # to; an empty path (the default) disables emission with zero
133
+ # overhead. Backwards-compatible with the 7.1a/b no-arg form.
134
+ def self.serve!(events_jsonl = "")
135
+ events = Tep::Events.new(events_jsonl)
136
+ Tep::APP.set_openai_events(events)
137
+ host = ENV["HOSTNAME"]
138
+ if host.length == 0
139
+ host = "tep"
140
+ end
141
+ # backend.device_kind => the run_start's `backend.kind`; reads
142
+ # the backend via APP.openai_backend so a `use`d subclass's
143
+ # override answers (e.g. ToyBackend returning "cuda").
144
+ backend_kind = Tep::APP.openai_backend.device_kind
145
+ config_json = "{" +
146
+ Tep::Json.encode_pair_str("server", "tep-llm-openai") + "," +
147
+ Tep::Json.encode_pair_str("events_jsonl", events_jsonl) +
148
+ "}"
149
+ events.run_start(host, backend_kind, "", "", config_json)
150
+ Tep.get("/v1/models", Tep::Llm::OpenAI::ModelsHandler.new)
151
+ Tep.post("/v1/completions", Tep::Llm::OpenAI::CompletionsHandler.new)
152
+ Tep.post("/v1/chat/completions", Tep::Llm::OpenAI::ChatCompletionsHandler.new)
153
+ # Always mounted; the handler 501s when supports_embeddings?
154
+ # is false (same gate shape as chat completions).
155
+ Tep.post("/v1/embeddings", Tep::Llm::OpenAI::EmbeddingsHandler.new)
156
+ 0
157
+ end
158
+ end
159
+
160
+ # Parse the `messages` array from an OpenAI chat request body.
161
+ # Returns [Tep::Llm::Message, ...] (one per `{role, content}`
162
+ # object); empty if the key is missing or the value isn't an
163
+ # array.
164
+ #
165
+ # Helper for `chat_completion(req)` overrides — backends that
166
+ # need the parsed messages array (most do, for applying their
167
+ # chat template) can call this instead of writing their own
168
+ # JSON walker:
169
+ #
170
+ # def chat_completion(req)
171
+ # messages = Tep::Llm::OpenAI.parse_messages(req.raw_body)
172
+ # # ...apply template, tokenize, generate...
173
+ # end
174
+ #
175
+ # Honors only `role` + `content` (the v1 fields). Other fields
176
+ # in the message object (e.g. `name`, `tool_calls`) are ignored
177
+ # for now; future chunks may extend the shape.
178
+ def self.parse_messages(body)
179
+ out = [Tep::Llm::Message.new("", "")]
180
+ out.delete_at(0)
181
+ pos = Tep::Json.find_value_start(body, "messages")
182
+ if pos < 0
183
+ return out
184
+ end
185
+ pos = Tep::Json.skip_ws(body, pos)
186
+ if pos >= body.length || body[pos] != "["
187
+ return out
188
+ end
189
+ pos += 1
190
+ while pos < body.length
191
+ pos = Tep::Json.skip_ws(body, pos)
192
+ if pos >= body.length
193
+ return out
194
+ end
195
+ c = body[pos]
196
+ if c == "]"
197
+ return out
198
+ end
199
+ if c == ","
200
+ pos += 1
201
+ next
202
+ end
203
+ if c == "{"
204
+ obj_end = Tep::Json.skip_container(body, pos)
205
+ # Parse role + content within this object range using two
206
+ # scoped passes (find_obj_key_str). A body-wide Tep::Json key
207
+ # search could match a key in a sibling object, so each pass
208
+ # only scans the bytes between `pos` and `obj_end`, looking
209
+ # for `"role"` / `"content"`.
210
+ role = Tep::Llm::OpenAI.find_obj_key_str(body, pos, obj_end, "role")
211
+ cont = Tep::Llm::OpenAI.find_obj_key_str(body, pos, obj_end, "content")
212
+ out.push(Tep::Llm::Message.new(role, cont))
213
+ pos = obj_end
214
+ else
215
+ pos = Tep::Json.skip_value(body, pos)
216
+ end
217
+ end
218
+ out
219
+ end
220
+
221
+ # Scan body[obj_start..obj_end) for `"key":"<value>"` and return
222
+ # the unescaped value. Returns "" if the key isn't present. Used
223
+ # by parse_messages above to extract per-message fields without
224
+ # crossing into adjacent message objects.
225
+ def self.find_obj_key_str(body, obj_start, obj_end, key)
226
+ needle = "\"" + key + "\""
227
+ pos = Tep.str_find(body, needle, obj_start)
228
+ if pos < 0 || pos >= obj_end
229
+ return ""
230
+ end
231
+ pos = pos + needle.length
232
+ pos = Tep::Json.skip_ws(body, pos)
233
+ if pos >= obj_end || body[pos] != ":"
234
+ return ""
235
+ end
236
+ pos += 1
237
+ pos = Tep::Json.skip_ws(body, pos)
238
+ if pos >= obj_end
239
+ return ""
240
+ end
241
+ Tep::Json.parse_str_value(body, pos)
242
+ end
243
+
244
+ # Sampling parameters handed to the backend. v1 carries
245
+ # max_tokens + temperature + top_p (the three OpenAI completion
246
+ # knobs every client sets). Floats parsed via Tep::Json.get_float.
247
+ # Defaults match OpenAI's API defaults so a backend that ignores
248
+ # sampling gets pass-through behavior.
249
+ class Sampling
250
+ attr_accessor :max_tokens, :temperature, :top_p
251
+
252
+ def initialize
253
+ @max_tokens = 0
254
+ @temperature = 1.0
255
+ @top_p = 1.0
256
+ end
257
+ end
258
+
259
+ # A backend's generation result: the decoded text + token usage.
260
+ class Completion
261
+ attr_accessor :text, :prompt_tokens, :completion_tokens
262
+
263
+ def initialize
264
+ @text = ""
265
+ @prompt_tokens = 0
266
+ @completion_tokens = 0
267
+ end
268
+ end
269
+
270
+ # The per-token write surface a streaming backend uses (7.2). One
271
+ # method: `emit_token(piece)`. The sink formats `piece` as an
272
+ # OpenAI text-completion SSE frame and writes one chunked frame
273
+ # to the outbound stream. Counts emitted tokens for the
274
+ # inference event's completion_tokens.
275
+ #
276
+ # Why a sink object instead of a block: spinel can't lower a
277
+ # block parameter across the backend call boundary; a typed
278
+ # object with one method does the same job through ordinary
279
+ # virtual dispatch.
280
+ class StreamSink
281
+ attr_accessor :out, :model, :completion_count
282
+
283
+ def initialize
284
+ @model = ""
285
+ @completion_count = 0
286
+ end
287
+
288
+ # Write one SSE event carrying a single text delta. Matches
289
+ # OpenAI's text_completion streaming shape: one choices[].text
290
+ # per event, finish_reason: null until the streamer sends
291
+ # [DONE]. created uses Time.now.to_i (epoch seconds).
292
+ def emit_token(piece)
293
+ @completion_count = @completion_count + 1
294
+ frame = "{" +
295
+ Tep::Json.encode_pair_str("id", "cmpl-tep") + "," +
296
+ Tep::Json.encode_pair_str("object", "text_completion") + "," +
297
+ Tep::Json.encode_pair_int("created", Time.now.to_i) + "," +
298
+ Tep::Json.encode_pair_str("model", @model) + "," +
299
+ "\"choices\":[{" +
300
+ Tep::Json.encode_pair_int("index", 0) + "," +
301
+ Tep::Json.encode_pair_str("text", piece) + "," +
302
+ "\"finish_reason\":null" +
303
+ "}]" +
304
+ "}"
305
+ @out.write("data: " + frame + "\n\n")
306
+ 0
307
+ end
308
+ end
309
+
310
+ # Runs one streaming completion. Subclass of Tep::Streamer so the
311
+ # server pumps `pump(out)` cooperatively; we own the SSE shape
312
+ # end-to-end: drive the backend through StreamSink, write the
313
+ # terminating data:[DONE], then emit the toy/v1 serving event
314
+ # (kind:eval, phase:serve, name:request) via Events#inference.
315
+ class CompletionsStreamer < Tep::Streamer
316
+ attr_accessor :model, :token_ids, :sampling
317
+ attr_accessor :prompt_tokens, :t0, :request_id, :principal_id
318
+
319
+ def initialize
320
+ @model = ""
321
+ @token_ids = [0]
322
+ @token_ids.delete_at(0)
323
+ @sampling = Tep::Llm::OpenAI::Sampling.new
324
+ @prompt_tokens = 0
325
+ @t0 = 0
326
+ @request_id = ""
327
+ @principal_id = ""
328
+ end
329
+
330
+ def pump(out)
331
+ sink = Tep::Llm::OpenAI::StreamSink.new
332
+ sink.out = out
333
+ sink.model = @model
334
+ Tep::APP.openai_backend.generate_stream_from_tokens(
335
+ @model, @token_ids, @sampling, sink)
336
+ # Terminating sentinel + inference event. wall_us is
337
+ # second-resolution for the same reason as the non-streaming
338
+ # path (spinel Time.now exposes epoch-int only); LLM is
339
+ # seconds-scale, populated wall_us is enough signal.
340
+ out.write("data: [DONE]\n\n")
341
+ wall_us = (Time.now.to_i - @t0) * 1_000_000
342
+ extra = "{" +
343
+ Tep::Json.encode_pair_str("request_id", @request_id) + "," +
344
+ Tep::Json.encode_pair_str("principal_id", @principal_id) +
345
+ "}"
346
+ Tep::APP.openai_events.inference(
347
+ @model, @prompt_tokens, sink.completion_count, wall_us, extra)
348
+ 0
349
+ end
350
+ end
351
+
352
+ # Chat-streaming write surface (#127). Three emit_* methods
353
+ # cover the OpenAI chat-streaming wire shape:
354
+ #
355
+ # 1. emit_role_prelude("assistant") -> first frame carries
356
+ # `delta:{role:"assistant"}` (no content).
357
+ # 2. emit_token(piece) -> N content frames, each
358
+ # `delta:{content:<piece>}` with finish_reason:null.
359
+ # 3. emit_finish("stop") -> last frame carries an empty
360
+ # `delta:{}` with finish_reason set; the streamer then
361
+ # writes the terminating data:[DONE].
362
+ #
363
+ # Backends typically: sink.emit_role_prelude("assistant"); then
364
+ # call sink.emit_token(piece) per generated token. emit_finish
365
+ # is invoked by the streamer after the backend returns -- not
366
+ # the backend's responsibility.
367
+ class ChatStreamSink
368
+ attr_accessor :out, :model, :completion_count
369
+
370
+ def initialize
371
+ @model = ""
372
+ @completion_count = 0
373
+ end
374
+
375
+ # First frame: role-only delta, no content. Per OpenAI's
376
+ # wire shape, sent once before content frames.
377
+ def emit_role_prelude(role)
378
+ frame = "{" +
379
+ Tep::Json.encode_pair_str("id", "chatcmpl-tep") + "," +
380
+ Tep::Json.encode_pair_str("object", "chat.completion.chunk") + "," +
381
+ Tep::Json.encode_pair_int("created", Time.now.to_i) + "," +
382
+ Tep::Json.encode_pair_str("model", @model) + "," +
383
+ "\"choices\":[{" +
384
+ Tep::Json.encode_pair_int("index", 0) + "," +
385
+ "\"delta\":{" +
386
+ Tep::Json.encode_pair_str("role", role) +
387
+ "}," +
388
+ "\"finish_reason\":null" +
389
+ "}]" +
390
+ "}"
391
+ @out.write("data: " + frame + "\n\n")
392
+ 0
393
+ end
394
+
395
+ # Content delta. One per generated token.
396
+ def emit_token(piece)
397
+ @completion_count = @completion_count + 1
398
+ frame = "{" +
399
+ Tep::Json.encode_pair_str("id", "chatcmpl-tep") + "," +
400
+ Tep::Json.encode_pair_str("object", "chat.completion.chunk") + "," +
401
+ Tep::Json.encode_pair_int("created", Time.now.to_i) + "," +
402
+ Tep::Json.encode_pair_str("model", @model) + "," +
403
+ "\"choices\":[{" +
404
+ Tep::Json.encode_pair_int("index", 0) + "," +
405
+ "\"delta\":{" +
406
+ Tep::Json.encode_pair_str("content", piece) +
407
+ "}," +
408
+ "\"finish_reason\":null" +
409
+ "}]" +
410
+ "}"
411
+ @out.write("data: " + frame + "\n\n")
412
+ 0
413
+ end
414
+
415
+ # Final frame: empty delta + populated finish_reason. The
416
+ # streamer writes data:[DONE] after this.
417
+ def emit_finish(reason)
418
+ frame = "{" +
419
+ Tep::Json.encode_pair_str("id", "chatcmpl-tep") + "," +
420
+ Tep::Json.encode_pair_str("object", "chat.completion.chunk") + "," +
421
+ Tep::Json.encode_pair_int("created", Time.now.to_i) + "," +
422
+ Tep::Json.encode_pair_str("model", @model) + "," +
423
+ "\"choices\":[{" +
424
+ Tep::Json.encode_pair_int("index", 0) + "," +
425
+ "\"delta\":{}," +
426
+ Tep::Json.encode_pair_str("finish_reason", reason) +
427
+ "}]" +
428
+ "}"
429
+ @out.write("data: " + frame + "\n\n")
430
+ 0
431
+ end
432
+ end
433
+
434
+ # Runs one streaming chat completion. Subclass of Tep::Streamer.
435
+ # Drives backend.chat_completion_stream through ChatStreamSink,
436
+ # writes the terminating data:[DONE], then emits the toy/v1
437
+ # serving event (kind:eval, phase:serve, name:request) with
438
+ # sink.completion_count (mirrors CompletionsStreamer's #128 shape).
439
+ class ChatCompletionsStreamer < Tep::Streamer
440
+ attr_accessor :req_ref, :model, :prompt_tokens
441
+ attr_accessor :t0, :request_id, :principal_id
442
+
443
+ def initialize
444
+ @req_ref = Tep::Request.new
445
+ @model = ""
446
+ @prompt_tokens = 0
447
+ @t0 = 0
448
+ @request_id = ""
449
+ @principal_id = ""
450
+ end
451
+
452
+ def pump(out)
453
+ sink = Tep::Llm::OpenAI::ChatStreamSink.new
454
+ sink.out = out
455
+ sink.model = @model
456
+ sink.emit_role_prelude("assistant")
457
+ Tep::APP.openai_backend.chat_completion_stream(@req_ref, sink)
458
+ sink.emit_finish("stop")
459
+ out.write("data: [DONE]\n\n")
460
+ wall_us = (Time.now.to_i - @t0) * 1_000_000
461
+ extra = "{" +
462
+ Tep::Json.encode_pair_str("request_id", @request_id) + "," +
463
+ Tep::Json.encode_pair_str("principal_id", @principal_id) +
464
+ "}"
465
+ Tep::APP.openai_events.inference(
466
+ @model, @prompt_tokens, sink.completion_count, wall_us, extra)
467
+ 0
468
+ end
469
+ end
470
+
471
+ # GET /v1/models -- the standard OpenAI list envelope, built from
472
+ # backend.list_models. Dispatches through APP.openai_backend so
473
+ # the app's subclass override is what answers.
474
+ class ModelsHandler < Tep::Handler
475
+ def handle(req, res)
476
+ res.headers["Content-Type"] = "application/json"
477
+ models = Tep::APP.openai_backend.list_models
478
+ out = "{\"object\":\"list\",\"data\":["
479
+ i = 0
480
+ while i < models.length
481
+ if i > 0
482
+ out = out + ","
483
+ end
484
+ out = out + "{" +
485
+ Tep::Json.encode_pair_str("id", models[i]) + "," +
486
+ Tep::Json.encode_pair_str("object", "model") + "," +
487
+ Tep::Json.encode_pair_str("owned_by", "tep") +
488
+ "}"
489
+ i += 1
490
+ end
491
+ out = out + "]}"
492
+ out
493
+ end
494
+ end
495
+
496
+ # POST /v1/completions -- token-level OpenAI shape (the primary
497
+ # completion route). Parses model / prompt (token ids) /
498
+ # max_tokens, calls backend.generate_from_tokens, and formats the
499
+ # standard text_completion response. Dispatches through
500
+ # APP.openai_backend (the app's subclass override answers).
501
+ class CompletionsHandler < Tep::Handler
502
+ def handle(req, res)
503
+ body = req.raw_body
504
+ model = Tep::Json.get_str(body, "model")
505
+ token_ids = Tep::Json.get_int_array(body, "prompt")
506
+ sampling = Tep::Llm::OpenAI::Sampling.new
507
+ sampling.max_tokens = Tep::Json.get_int(body, "max_tokens")
508
+ # Floats from the JSON body; defaults stay at 1.0 if the
509
+ # key is absent (Tep::Json.get_float returns 0.0 for
510
+ # missing, but we only overwrite when present).
511
+ if Tep::Json.has_key?(body, "temperature")
512
+ sampling.temperature = Tep::Json.get_float(body, "temperature")
513
+ end
514
+ if Tep::Json.has_key?(body, "top_p")
515
+ sampling.top_p = Tep::Json.get_float(body, "top_p")
516
+ end
517
+
518
+ # OpenAI signals streaming with "stream": true in the JSON
519
+ # body; Tep::Json has no bool getter, so we sniff the literal
520
+ # (same shape as examples/llm_gateway/app.rb). When set, the
521
+ # response is SSE: a CompletionsStreamer pumps per-token
522
+ # frames + the [DONE] sentinel, then emits the inference
523
+ # event with sink.completion_count.
524
+ wants_stream = Tep.str_find(body, "\"stream\":true", 0) >= 0 ||
525
+ Tep.str_find(body, "\"stream\": true", 0) >= 0
526
+ if wants_stream
527
+ res.headers["Content-Type"] = "text/event-stream"
528
+ res.headers["Cache-Control"] = "no-cache"
529
+ streamer = Tep::Llm::OpenAI::CompletionsStreamer.new
530
+ streamer.model = model
531
+ streamer.token_ids = token_ids
532
+ streamer.sampling = sampling
533
+ streamer.prompt_tokens = token_ids.length
534
+ streamer.t0 = Time.now.to_i
535
+ streamer.request_id = "cmpl-tep"
536
+ streamer.principal_id = req.identity.subject
537
+ res.start_stream(streamer)
538
+ return ""
539
+ end
540
+
541
+ res.headers["Content-Type"] = "application/json"
542
+
543
+ # Stamp t0 for the inference event's wall_us. Time.now exposes
544
+ # only integer epoch seconds under spinel, so wall_us is at
545
+ # second-resolution (latency * 1_000_000) -- coarse, but LLM
546
+ # serving is seconds-scale, fine for the run-level analytics.
547
+ # A µs clock helper lands later; until then this is the right
548
+ # placeholder shape so consumers see populated wall_us.
549
+ t0 = Time.now.to_i
550
+
551
+ comp = Tep::APP.openai_backend.generate_from_tokens(model, token_ids, sampling)
552
+ total = comp.prompt_tokens + comp.completion_tokens
553
+
554
+ # Emit one inference event per request. Skipped when events
555
+ # are disabled via path-length short-circuit inside #inference.
556
+ # request_id matches the JSON response's id; principal_id is
557
+ # the auth-filter populated identity (anonymous if none).
558
+ wall_us = (Time.now.to_i - t0) * 1_000_000
559
+ extra = "{" +
560
+ Tep::Json.encode_pair_str("request_id", "cmpl-tep") + "," +
561
+ Tep::Json.encode_pair_str("principal_id", req.identity.subject) +
562
+ "}"
563
+ Tep::APP.openai_events.inference(
564
+ model, comp.prompt_tokens, comp.completion_tokens, wall_us, extra
565
+ )
566
+
567
+ "{" +
568
+ Tep::Json.encode_pair_str("id", "cmpl-tep") + "," +
569
+ Tep::Json.encode_pair_str("object", "text_completion") + "," +
570
+ Tep::Json.encode_pair_int("created", Time.now.to_i) + "," +
571
+ Tep::Json.encode_pair_str("model", model) + "," +
572
+ "\"choices\":[{" +
573
+ Tep::Json.encode_pair_int("index", 0) + "," +
574
+ Tep::Json.encode_pair_str("text", comp.text) + "," +
575
+ Tep::Json.encode_pair_str("finish_reason", "stop") +
576
+ "}]," +
577
+ "\"usage\":{" +
578
+ Tep::Json.encode_pair_int("prompt_tokens", comp.prompt_tokens) + "," +
579
+ Tep::Json.encode_pair_int("completion_tokens", comp.completion_tokens) + "," +
580
+ Tep::Json.encode_pair_int("total_tokens", total) +
581
+ "}" +
582
+ "}"
583
+ end
584
+ end
585
+
586
# POST /v1/chat/completions -- message-level OpenAI shape.
#
# Gated 501 when backend.supports_chat? is false (the default; chat
# templating is per-model + an ML concern tep doesn't ship). When a
# backend opts in (overrides supports_chat? to true + chat_completion),
# the request takes one of two paths:
#
#   * Streaming -- the raw body contains "stream":true (or
#     "stream": true). The response is switched to text/event-stream
#     and handed to a ChatCompletionsStreamer.
#   * Non-streaming -- dispatches to backend.chat_completion(req) and
#     formats the standard chat.completion envelope around the
#     returned Completion (the text field becomes the assistant
#     message's content). One inference event is emitted per request,
#     carrying the same request_id / principal_id pair the streaming
#     path hands to its streamer.
class ChatCompletionsHandler < Tep::Handler
  # Handle one chat-completions request.
  #
  # @param req the incoming request; raw_body and identity.subject are read
  # @param res the response; headers, status and (when streaming) the
  #   stream source are set on it
  # @return [String] the JSON response body, or "" once a stream has
  #   been started on res
  def handle(req, res)
    res.headers["Content-Type"] = "application/json"
    if !Tep::APP.openai_backend.supports_chat?
      res.set_status(501)
      return "{" +
        "\"error\":{" +
        Tep::Json.encode_pair_str("message",
          "chat completions not supported by this backend") + "," +
        Tep::Json.encode_pair_str("type", "not_implemented") +
        "}" +
        "}"
    end
    body = req.raw_body
    model = Tep::Json.get_str(body, "model")

    # Streaming branch (#127): same "stream":true sniff as
    # CompletionsHandler. Sends an SSE response driven by
    # ChatCompletionsStreamer -- which calls into
    # backend.chat_completion_stream via a ChatStreamSink.
    wants_stream = Tep.str_find(body, "\"stream\":true", 0) >= 0 ||
                   Tep.str_find(body, "\"stream\": true", 0) >= 0
    if wants_stream
      res.headers["Content-Type"] = "text/event-stream"
      res.headers["Cache-Control"] = "no-cache"
      streamer = Tep::Llm::OpenAI::ChatCompletionsStreamer.new
      streamer.req_ref = req
      streamer.model = model
      # No `prompt` token-id array on chat requests; pass 0 so
      # the inference event has a deterministic value. A future
      # refinement can derive prompt_tokens from the messages
      # array's byte length / tokenizer estimate.
      streamer.prompt_tokens = 0
      streamer.t0 = Time.now.to_i
      streamer.request_id = "chatcmpl-tep"
      streamer.principal_id = req.identity.subject
      res.start_stream(streamer)
      return ""
    end

    # Time#to_i is whole seconds, so wall_us below only has
    # one-second resolution; it exists so the event carries a
    # populated wall_us field.
    t0 = Time.now.to_i
    comp = Tep::APP.openai_backend.chat_completion(req)
    total = comp.prompt_tokens + comp.completion_tokens

    # Emit one inference event for the non-streaming request so it is
    # accounted for like a streamed one. request_id matches the "id"
    # field of the JSON response built below.
    wall_us = (Time.now.to_i - t0) * 1_000_000
    extra = "{" +
      Tep::Json.encode_pair_str("request_id", "chatcmpl-tep") + "," +
      Tep::Json.encode_pair_str("principal_id", req.identity.subject) +
      "}"
    Tep::APP.openai_events.inference(
      model, comp.prompt_tokens, comp.completion_tokens, wall_us, extra
    )

    "{" +
      Tep::Json.encode_pair_str("id", "chatcmpl-tep") + "," +
      Tep::Json.encode_pair_str("object", "chat.completion") + "," +
      Tep::Json.encode_pair_int("created", Time.now.to_i) + "," +
      Tep::Json.encode_pair_str("model", model) + "," +
      "\"choices\":[{" +
      Tep::Json.encode_pair_int("index", 0) + "," +
      "\"message\":{" +
      Tep::Json.encode_pair_str("role", "assistant") + "," +
      Tep::Json.encode_pair_str("content", comp.text) +
      "}," +
      Tep::Json.encode_pair_str("finish_reason", "stop") +
      "}]," +
      "\"usage\":{" +
      Tep::Json.encode_pair_int("prompt_tokens", comp.prompt_tokens) + "," +
      Tep::Json.encode_pair_int("completion_tokens", comp.completion_tokens) + "," +
      Tep::Json.encode_pair_int("total_tokens", total) +
      "}" +
      "}"
  end
end
657
+
658
# POST /v1/embeddings -- OpenAI embeddings shape. Gated 501 when
# backend.supports_embeddings? is false (the default). When a
# backend opts in, parses the IDs-only `input` array, asks the
# backend for the pooled vector, and formats the standard
# embeddings envelope. Mirrors toy's mean-pooled handler -- the
# pooling strategy lives in the backend, not here.
#
# Responses:
#   * 501 -- backend does not support embeddings.
#   * 400 -- `input` parsed to an empty integer array.
#   * 500 -- the backend returned a vector containing NaN or
#     +/-Infinity, which cannot be written as a JSON number.
#   * otherwise the embeddings envelope, with usage counts equal to
#     the number of input token IDs.
class EmbeddingsHandler < Tep::Handler
  # Handle one embeddings request.
  #
  # @param req the incoming request; only raw_body is read
  # @param res the response; Content-Type and status are set on it
  # @return [String] the JSON response body (envelope or error object)
  def handle(req, res)
    res.headers["Content-Type"] = "application/json"
    if !Tep::APP.openai_backend.supports_embeddings?
      res.set_status(501)
      return "{" +
        "\"error\":{" +
        Tep::Json.encode_pair_str("message",
          "embeddings not supported by this backend") + "," +
        Tep::Json.encode_pair_str("type", "not_implemented") +
        "}" +
        "}"
    end
    body = req.raw_body
    model = Tep::Json.get_str(body, "model")
    ids = Tep::Json.get_int_array(body, "input")
    if ids.length == 0
      res.set_status(400)
      return "{" +
        "\"error\":{" +
        Tep::Json.encode_pair_str("message",
          "input must be a non-empty integer array " +
          "(this server speaks token IDs only; tokenize client-side)") + "," +
        Tep::Json.encode_pair_str("type", "invalid_request_error") +
        "}" +
        "}"
    end

    vec = Tep::APP.openai_backend.generate_embeddings(model, ids)

    # Build the embedding float array by hand: Tep::Json has no
    # float-array encoder. Float#to_s yields a JSON number for finite
    # values only -- NaN and +/-Infinity print as "NaN" / "Infinity" /
    # "-Infinity", which are not valid JSON -- so those are rejected
    # before anything is emitted.
    emb = "["
    k = 0
    while k < vec.length
      x = vec[k]
      # x - x is 0 for every finite number and NaN for NaN or
      # +/-Infinity; NaN compares unequal to everything, so this one
      # test catches all three non-finite cases.
      if (x - x) != 0.0
        res.set_status(500)
        return "{" +
          "\"error\":{" +
          Tep::Json.encode_pair_str("message",
            "backend returned a non-finite embedding value") + "," +
          Tep::Json.encode_pair_str("type", "server_error") +
          "}" +
          "}"
      end
      if k > 0
        emb = emb + ","
      end
      emb = emb + x.to_s
      k = k + 1
    end
    emb = emb + "]"

    # Usage is reported as the number of input token IDs.
    n = ids.length
    "{" +
      Tep::Json.encode_pair_str("object", "list") + "," +
      "\"data\":[{" +
      Tep::Json.encode_pair_str("object", "embedding") + "," +
      Tep::Json.encode_pair_int("index", 0) + "," +
      "\"embedding\":" + emb +
      "}]," +
      Tep::Json.encode_pair_str("model", model) + "," +
      "\"usage\":{" +
      Tep::Json.encode_pair_int("prompt_tokens", n) + "," +
      Tep::Json.encode_pair_int("total_tokens", n) +
      "}" +
      "}"
  end
end
723
+ end
724
+ end
725
+ end