revund-ruby-worker 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,114 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'parser'
4
+ require_relative 'fetcher'
5
+ # Generated by grpc_tools_ruby_protoc from
6
+ # /proto/worker/v1/worker.proto. The codegen script lives at
7
+ # scripts/gen-proto.sh (a follow-up). Uncomment once generated:
8
+ #
9
+ # require 'worker/v1/worker_pb'
10
+ # require 'worker/v1/worker_services_pb'
11
+
12
module RubyWorker
  # Service implements the universal `revund.worker.v1.Worker`
  # contract — the same contract ts-worker and php-worker speak.
  #
  # Each handler is thin: it checks the auth header, looks up the
  # generated response message class, and translates between the
  # gRPC wire shape and the Parser domain object. While the protobuf
  # stubs have not been generated yet the message classes do not
  # exist; in that case every handler returns nil rather than
  # raising NameError, so swapping the stubs in is mechanical.
  class Service
    # include ::Revund::Worker::V1::Worker::Service

    NAME = 'ruby-worker'
    LANGUAGES = ['ruby'].freeze
    CAPABILITIES = %w[parse self_fetch].freeze

    # AUTH_HEADER mirrors the Go-side constant in
    # core/pkg/worker/auth.go. The bot stamps it on every
    # outbound RPC; this worker rejects requests without a
    # matching value when REVUND_WORKER_SECRET is configured.
    AUTH_HEADER = 'x-revund-worker-token'
    AUTH_SECRET_ENV = 'REVUND_WORKER_SECRET'

    def initialize
      @parser = RubyWorker::Parser.new
    end

    # Describe — self-identifies. The bot calls this on first
    # connect to learn what languages + capabilities this worker
    # advertises.
    #
    # Returns a DescribeResponse, or nil while the generated
    # message classes are unavailable. Raises
    # GRPC::Unauthenticated when the auth check fails.
    def describe(_request, call)
      return unauthenticated!(call) unless authorized?(call)

      response_class = message_class(:DescribeResponse)
      return nil if response_class.nil?

      # NOTE(review): the version file visible alongside this class
      # defines RubyWorker::VERSION; Server is not visible here —
      # confirm Server::VERSION exists (and tracks that constant).
      response_class.new(
        name: NAME,
        version: Server::VERSION,
        languages: LANGUAGES,
        capabilities: CAPABILITIES
      )
    end

    # Health — liveness probe. Replies with the worker version, or
    # nil while the generated message classes are unavailable.
    # Raises GRPC::Unauthenticated when the auth check fails.
    def health(_request, call)
      return unauthenticated!(call) unless authorized?(call)

      response_class = message_class(:HealthResponse)
      return nil if response_class.nil?

      response_class.new(version: Server::VERSION)
    end

    # Parse — resolves the repo root (shared path or self-fetched
    # checkout), hands `request.files` to the Parser and wraps the
    # result in a ParseResponse. Returns nil while the generated
    # message classes are unavailable. Raises GRPC::Unauthenticated
    # when the auth check fails.
    def parse(request, call)
      return unauthenticated!(call) unless authorized?(call)

      response_class = message_class(:ParseResponse)
      return nil if response_class.nil?

      repo_path = resolve_repo_path(request)
      parsed = @parser.parse_files(repo_path, request.files.to_a)
      response_class.new(files: parsed)
    end

    private

    # Looks up a generated message class under ::Revund::Worker::V1.
    # Returns nil when the namespace or the class is not loaded.
    # Only NameError is rescued, so unrelated failures are no longer
    # silently swallowed the way a bare `rescue nil` modifier would.
    def message_class(name)
      ::Revund::Worker::V1.const_get(name, false)
    rescue NameError
      nil
    end

    # Validate the bearer header against REVUND_WORKER_SECRET.
    # Empty / unset secret = no enforcement (CLI / local-dev
    # default). Returns true on success or "no enforcement."
    #
    # Metadata values may arrive as a String or an Array of
    # Strings; only the first value is considered.
    def authorized?(call)
      expected = ENV[AUTH_SECRET_ENV].to_s
      return true if expected.empty?

      md = call.respond_to?(:metadata) ? (call.metadata || {}) : {}
      presented = Array(md[AUTH_HEADER]).first.to_s
      secure_compare(presented, expected)
    end

    # Constant-time string equality. String#== stops at the first
    # differing byte, which lets a caller recover the secret one
    # byte at a time from response timing. Here every byte is
    # always examined; only the length can be inferred.
    def secure_compare(left, right)
      return false unless left.bytesize == right.bytesize

      diff = 0
      left.bytes.zip(right.bytes) { |a, b| diff |= a ^ b }
      diff.zero?
    end

    # Rejects the call. Always raises GRPC::Unauthenticated.
    def unauthenticated!(_call)
      raise GRPC::Unauthenticated, 'missing or invalid x-revund-worker-token'
    end

    # Two dispatch modes:
    #   - shared-FS path mode: returns request.repo_path verbatim
    #   - self-fetch mode: hands the RepoSource to Fetcher, returns
    #     the local cached checkout path.
    #
    # Self-fetch is chosen when the request carries a repo_source
    # with a non-empty url. auth_user is optional on the message
    # and is passed as '' when the field is absent.
    def resolve_repo_path(request)
      src = request.respond_to?(:repo_source) ? request.repo_source : nil
      if src && !src.url.to_s.empty?
        return Fetcher.fetch_or_cache(
          url: src.url,
          ref: src.ref,
          auth_token: src.auth_token,
          auth_user: src.respond_to?(:auth_user) ? src.auth_user : ''
        )
      end
      request.repo_path
    end

    # ResolveSymbols and RunDiagnostics are intentionally not
    # implemented. The bot's caller checks the `capabilities`
    # list from Describe and skips RPCs the worker hasn't
    # advertised — so unimplemented = silently skipped.
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyWorker # same namespace that holds RubyWorker::Service
4
+ VERSION = '0.1.0' # release version of the worker package. NOTE(review): Service reports Server::VERSION — confirm that is derived from this constant.
5
+ end
@@ -0,0 +1,480 @@
1
+ // worker.proto — the universal AST-worker contract.
2
+ //
3
+ // # Public API
4
+ //
5
+ // This file defines the wire contract every Revund AST
6
+ // sidecar speaks. It is designed for STABILITY since the
7
+ // contract may eventually be published as an open-source
8
+ // interface that third-party language sidecars target.
9
+ //
10
+ // Compatibility rules:
11
+ //
12
+ // - Adding fields to existing messages: ALLOWED (proto3
13
+ // ignores unknown fields gracefully).
14
+ // - Adding new RPCs: ALLOWED (clients negotiate capability
15
+ // via Describe).
16
+ // - Removing or renaming fields: BREAKING — bump the
17
+ // package version (v1 → v2). Old clients keep speaking v1.
18
+ // - Changing field types: BREAKING.
19
+ //
20
+ // # Sidecar contract
21
+ //
22
+ // A Revund worker is any process implementing this service.
23
+ // It can be:
24
+ //
25
+ // - A first-party reference implementation we maintain
26
+ // (ts-worker, php-worker, ruby-worker)
27
+ // - A community-built sidecar for any language
28
+ // - A proprietary worker a customer builds for their
29
+ // in-house DSL
30
+ //
31
+ // The bot dials the worker by host:port, calls `Describe`
32
+ // to learn which languages + capabilities it advertises,
33
+ // and routes Parse RPCs based on the response. The bot
34
+ // does NOT hardcode language-to-worker mappings — every
35
+ // worker self-identifies.
36
+ //
37
+ // # Minimum viable worker
38
+ //
39
+ // Implement only `Describe`, `Health`, and `Parse`. Set
40
+ // `capabilities = ["parse"]`. The bot uses that worker for
41
+ // its advertised languages and skips the rest gracefully.
42
+ //
43
+ // # Symbol resolution / diagnostics (optional)
44
+ //
45
+ // Capabilities are advisory. Workers that ALSO support
46
+ // symbol resolution (returning declarations of identifiers
47
+ // referenced in changed code) advertise
48
+ // `capabilities = ["parse", "resolve_symbols"]`. Workers
49
+ // that run language-specific type-checkers (TypeScript's
50
+ // `tsc --noEmit`, PHP's PHPStan, Ruby's Sorbet) advertise
51
+ // `["parse", "diagnostics"]`. The bot uses these features
52
+ // when present, falls back to "just parse" when not.
53
+
54
+ syntax = "proto3";
55
+
56
+ package revund.worker.v1;
57
+
58
+ option go_package = "github.com/revund-dev/revund/core/pkg/worker/proto/worker/v1;workerpb";
59
+
60
+ service Worker {
61
+ // Describe identifies the worker. The bot calls this
62
+ // first to learn what languages + capabilities it
63
+ // supports. Idempotent, side-effect-free.
64
+ rpc Describe(DescribeRequest) returns (DescribeResponse);
65
+
66
+ // Health is a standard k8s-style liveness probe; the reply carries the worker's version string.
67
+ rpc Health(HealthRequest) returns (HealthResponse);
68
+
69
+ // Parse returns a minimal AST view (imports + decls +
70
+ // functions + concerns) for the requested files. Per-
71
+ // file parse errors are returned INSIDE the response (ParsedFile.parse_error),
72
+ // never as RPC errors. REQUIRED capability — every
73
+ // healthy worker implements this.
74
+ rpc Parse(ParseRequest) returns (ParseResponse);
75
+
76
+ // ResolveSymbols returns the declarations of identifiers
77
+ // referenced in a diff but defined elsewhere in the
78
+ // repo. Used by the bot's ingest layer to enrich the
79
+ // LLM bundle with cross-file type information.
80
+ //
81
+ // OPTIONAL capability — workers advertise
82
+ // "resolve_symbols" in Describe.Capabilities when
83
+ // implemented. Workers that don't implement it return
84
+ // UNIMPLEMENTED; the bot's caller checks the capability
85
+ // list and skips the call when unsupported.
86
+ rpc ResolveSymbols(ResolveRequest) returns (ResolveResponse);
87
+
88
+ // RunDiagnostics runs the language's native type-checker
89
+ // (tsc --noEmit for TypeScript, PHPStan for PHP, Sorbet
90
+ // for Ruby, etc.) and returns errors touching the
91
+ // changed files. Empty when the project has no errors
92
+ // or the worker doesn't run a type-checker — never an
93
+ // RPC error.
94
+ //
95
+ // OPTIONAL capability — workers advertise "diagnostics"
96
+ // in Describe.Capabilities when implemented.
97
+ rpc RunDiagnostics(DiagnosticsRequest) returns (DiagnosticsResponse);
98
+ }
99
+
100
+ // ─────────────────────────────────────────────────────────
101
+ // Describe — self-identification
102
+ // ─────────────────────────────────────────────────────────
103
+
104
+ message DescribeRequest {} // Empty: Describe takes no arguments.
105
+
106
+ message DescribeResponse {
107
+ // Name is a human-readable identifier for this worker,
108
+ // e.g. "ts-worker", "php-worker", "ruby-worker", or a
109
+ // community-chosen name like "my-fancy-go-worker".
110
+ // Used for logging and observability; NOT used for
111
+ // routing (languages field does that).
112
+ string name = 1;
113
+
114
+ // Version is the worker's semantic version. Logged at
115
+ // startup and surfaced in debug bundles so we can tie
116
+ // findings to a specific worker build.
117
+ string version = 2;
118
+
119
+ // Languages the worker can parse. Lowercase canonical
120
+ // names matching the lang.Language enum on the Go side:
121
+ // "typescript", "javascript", "go", "php", "ruby",
122
+ // "python", "rust", "java", "kotlin", "swift", "csharp".
123
+ //
124
+ // A worker MAY advertise multiple languages (a tree-
125
+ // sitter-based worker might handle ten). The bot routes
126
+ // each language's files to the worker that advertises
127
+ // it; on conflict (multiple workers advertise the same
128
+ // language), the bot picks the first one registered.
129
+ repeated string languages = 3;
130
+
131
+ // Capabilities is the set of features this worker
132
+ // implements: the required "parse" plus any optional
133
+ // extras. Known values:
134
+ //
135
+ // "parse" — Parse RPC (REQUIRED; always
136
+ // present in a healthy worker)
137
+ // "resolve_symbols" — symbol-decl resolution; the
138
+ // worker can return declarations
139
+ // of identifiers referenced in
140
+ // changed code.
141
+ // "diagnostics" — language-specific type-check
142
+ // diagnostics (tsc, PHPStan,
143
+ // Sorbet, etc.).
144
+ // "self_fetch" — worker can clone the repo
145
+ // itself given a RepoSource
146
+ // (url + ref + auth_token) on
147
+ // the request. Required for
148
+ // cross-host deployments where
149
+ // the bot and worker do NOT
150
+ // share a filesystem. When this
151
+ // capability is advertised, the
152
+ // bot sends `repo_source`
153
+ // instead of `repo_path` on
154
+ // Parse / ResolveSymbols /
155
+ // RunDiagnostics RPCs.
156
+ //
157
+ // New capabilities can be added without breaking older
158
+ // bots — workers advertise them, and a bot ignores any
159
+ // capability it doesn't recognize.
160
+ repeated string capabilities = 4;
161
+ }
162
+
163
+ // RepoSource tells a self-fetching worker how to obtain
164
+ // the repo without depending on a shared filesystem with
165
+ // the bot. The worker shallow-clones via git using these
166
+ // values, caches the clone for the review's other RPCs,
167
+ // and evicts on idle.
168
+ //
169
+ // Capability gate: workers MUST advertise "self_fetch"
170
+ // in Describe.Capabilities before the bot will send
171
+ // `repo_source`. Workers without that capability ignore
172
+ // the field entirely and use `repo_path`.
173
+ //
174
+ // # Security expectations on workers
175
+ //
176
+ // - Use auth_token ONLY at clone time (as the password
177
+ // in https://x-access-token:<TOKEN>@host/path.git).
178
+ // - Immediately after a successful clone, strip the
179
+ // token from the remote URL stored in .git/config.
180
+ // - Never log the token. Sanitize error messages that
181
+ // might include the URL.
182
+ // - Never persist the token to disk in any form.
183
+ //
184
+ // Tokens are short-lived (typically 1 hour for GitHub
185
+ // installation tokens) and scoped to the source repo's
186
+ // permissions, so blast radius is bounded — but the
187
+ // hygiene rules above still apply.
188
+ message RepoSource {
189
+ // url is the https clone URL, e.g.
190
+ // "https://github.com/owner/repo.git" — no userinfo.
191
+ // SSH URLs are not supported here — the bot's auth
192
+ // model is bearer-token-over-https.
193
+ string url = 1;
194
+
195
+ // ref is the commit SHA (preferred) or branch / tag
196
+ // name to check out. Commit SHA is deterministic and
197
+ // matches what GitHub / GitLab webhooks already give
198
+ // us; branch/tag names are accepted for the local-dev
199
+ // case but workers should warn when they receive one
200
+ // (drift risk).
201
+ string ref = 2;
202
+
203
+ // auth_token is the bearer credential the worker uses
204
+ // as the password in the clone URL. Typically a
205
+ // platform installation access token (1h TTL).
206
+ string auth_token = 3; // secret: never log or persist (see the security expectations above RepoSource)
207
+
208
+ // auth_user is the basic-auth username the worker
209
+ // pairs with auth_token to compose the clone URL:
210
+ //
211
+ // https://<auth_user>:<auth_token>@<host>/<path>
212
+ //
213
+ // Defaults to "x-access-token" when empty — that's
214
+ // GitHub's convention and works for the majority of
215
+ // first-party deployments. Other platforms set this
216
+ // explicitly:
217
+ //
218
+ // GitHub → "x-access-token" (or empty)
219
+ // GitLab → "oauth2"
220
+ // Bitbucket → "x-token-auth"
221
+ //
222
+ // Adding a new platform = the bot sets a new
223
+ // auth_user string; workers don't need to know which
224
+ // platform they're talking to.
225
+ string auth_user = 4;
226
+ }
227
+
228
+ message HealthRequest {} // Empty: Health takes no arguments.
229
+ message HealthResponse { // reply to the Health liveness probe
230
+ string version = 1; // worker version string; ruby-worker sends the same value it reports in DescribeResponse.version
231
+ }
232
+
233
+ // ─────────────────────────────────────────────────────────
234
+ // Parse — the core AST RPC
235
+ // ─────────────────────────────────────────────────────────
236
+
237
+ message ParseRequest {
238
+ // Absolute path to the repo root. The worker resolves
239
+ // file paths relative to this root.
240
+ //
241
+ // Exactly one of repo_path / repo_source is populated
242
+ // per request. repo_path is set when the bot and
243
+ // worker share a filesystem (CLI / local sidecar /
244
+ // pod-with-shared-volume). repo_source is set when
245
+ // the worker advertises "self_fetch" and the bot is
246
+ // dialing across the network.
247
+ string repo_path = 1;
248
+
249
+ // Repo-relative paths to parse. Slash-normalized,
250
+ // forward-slash separator regardless of platform.
251
+ repeated string files = 2;
252
+
253
+ // Self-fetch source — populated when the bot wants
254
+ // the worker to clone the repo itself. See RepoSource
255
+ // for the capability gate and security expectations. repo_path is left empty in that case.
256
+ RepoSource repo_source = 3;
257
+ }
258
+
259
+ message ParseResponse {
260
+ repeated ParsedFile files = 1; // parse results; match them to request files via ParsedFile.path, not by position
261
+ }
262
+
263
+ // ParsedFile carries the minimal AST surface every
264
+ // structural detector consumes. Shape is universal across
265
+ // languages — implementations populate the fields their
266
+ // language meaningfully exposes and leave the rest empty.
267
+ message ParsedFile {
268
+ // Repo-relative path, slash-normalized. Echoed back from
269
+ // the request so the bot can correlate response files
270
+ // with request files without ordering assumptions.
271
+ string path = 1;
272
+
273
+ // The language tag the worker assigned to this file.
274
+ // Usually matches one of the languages from the worker's
275
+ // Describe response. Set even on parse errors.
276
+ string language = 2;
277
+
278
+ repeated ImportRef imports = 3;
279
+ repeated DeclRef decls = 4;
280
+ repeated FunctionRef functions = 5;
281
+ repeated ConcernEvidenceRef concerns = 7; // field number 7 is out of declaration order; parse_error below holds 6
282
+
283
+ // Non-empty when the worker failed to parse this file
284
+ // cleanly. The other fields may still carry partial
285
+ // results — workers should return whatever the parser
286
+ // salvaged so detectors get USEFUL signal even on
287
+ // syntactically-broken files.
288
+ string parse_error = 6;
289
+ }
290
+
291
+ // ImportRef is one import / require / use declaration.
292
+ message ImportRef {
293
+ // Raw module identifier as written in source. The bot
294
+ // does NOT resolve this against the language's module
295
+ // system; that's the worker's job if it advertises
296
+ // resolve_symbols.
297
+ string path = 1;
298
+
299
+ // Local binding when the language allows renaming on
300
+ // import (Go: `import f "fmt"`, JS: `import { x as y }`,
301
+ // PHP: `use Foo\Bar as B`, Ruby: autoload). Empty when
302
+ // not applicable or not aliased. NOTE(review): confirm what a Ruby worker is expected to put here for autoload.
303
+ string alias = 2;
304
+
305
+ // 1-based line number in the source file.
306
+ int32 line = 3;
307
+ }
308
+
309
+ // DeclRef is one top-level declaration. Kind is a lowercase
310
+ // string from the canonical set: function | method | type |
311
+ // interface | const | var | class | trait | enum | module |
312
+ // constant | component | hook. Workers MAY emit
313
+ // kinds not in this list; the bot's structural detectors
314
+ // switch on the canonical set and silently ignore unknowns
315
+ // (forward-compat).
316
+ message DeclRef {
317
+ string name = 1; // declared identifier
318
+ string kind = 2; // lowercase kind string; the canonical set is listed in the comment above DeclRef
319
+ int32 line = 3; // line of the declaration — presumably 1-based like ImportRef.line; TODO confirm
320
+ int32 end_line = 4; // presumably the last line of the declaration, inclusive; TODO confirm
321
+
322
+ // Exported is whether this declaration is visible
323
+ // outside its compilation unit / module / namespace.
324
+ // Languages without formal export concepts (PHP, Ruby)
325
+ // set this to true for top-level decls.
326
+ bool exported = 5;
327
+ }
328
+
329
+ // FunctionRef describes one function / method definition.
330
+ // Carries the data the structural detectors need for
331
+ // god-function, complexity, and the three DRY variants.
332
+ message FunctionRef {
333
+ string name = 1;
334
+ int32 start_line = 2; // presumably 1-based like ImportRef.line; TODO confirm
335
+ int32 end_line = 3;
336
+
337
+ // Cyclomatic complexity estimate (McCabe — count of
338
+ // decision points + 1). Workers may compute it however
339
+ // they prefer; the bot uses the value as a coarse
340
+ // signal, not a precise measurement.
341
+ int32 complexity = 4;
342
+
343
+ bool is_method = 5;
344
+ bool is_exported = 6; // NOTE(review): presumably the same visibility notion as DeclRef.exported — confirm
345
+
346
+ // Hash is the LANGUAGE-SPECIFIC fingerprint of the
347
+ // function body's AST shape. Two functions in the same
348
+ // language with the same hash share structure modulo
349
+ // identifier names + literal values. Empty when the
350
+ // body was too small to be a meaningful DRY signal.
351
+ //
352
+ // The hashing scheme is per-worker. Recommended: SHA-1
353
+ // of a token stream, truncated to 16 hex chars. The
354
+ // ts-worker reference implementation in
355
+ // workers/ts/src/parser.ts is the template for new
356
+ // workers.
357
+ string hash = 7;
358
+
359
+ // CanonicalHash is the LANGUAGE-NEUTRAL fingerprint
360
+ // produced by mapping the function body's AST onto a
361
+ // shared canonical token vocabulary. Two functions in
362
+ // DIFFERENT languages with the same canonical hash share
363
+ // a structural shape — the basis for cross-language
364
+ // duplicate detection.
365
+ //
366
+ // The canonical vocabulary is defined in
367
+ // core/pkg/structural/lang/canonical.go. New workers
368
+ // mirror that vocabulary; PR-able if a new construct
369
+ // needs adding.
370
+ string canonical_hash = 8;
371
+
372
+ repeated BlockRef blocks = 9; // nested statement blocks (see BlockRef); presumably those inside this function's body
373
+ }
374
+
375
+ // BlockRef is one nested statement block (if-body, else-
376
+ // body, for-body, switch-case body, try / catch / finally
377
+ // body). Each block carries both hash variants so within-
378
+ // language and cross-language block-duplicate detection
379
+ // consume the same data.
380
+ message BlockRef {
381
+ // Kind of the construct that owns this block:
382
+ // "if" | "else" | "elseif" | "for" | "case" |
383
+ // "try" | "catch" | "finally" | "rescue" | "block".
384
+ string kind = 1;
385
+
386
+ int32 start_line = 2; // presumably 1-based like ImportRef.line; TODO confirm
387
+ int32 end_line = 3;
388
+ string hash = 4; // language-specific (same idea as FunctionRef.hash)
389
+ string canonical_hash = 5; // language-neutral (same idea as FunctionRef.canonical_hash)
390
+ }
391
+
392
+ // ConcernEvidenceRef is one categorized signal the worker
393
+ // extracted. The bot demuxes by category into the typed
394
+ // ConcernSet on lang.FileView.
395
+ //
396
+ // Canonical category values:
397
+ // "presentation" — UI rendering / templates / JSX
398
+ // "state" — in-memory state, hooks, signals
399
+ // "transport" — server-side request handlers
400
+ // "network" — outbound HTTP / RPC / message-broker
401
+ // "dataaccess" — persistent storage operations
402
+ // "io" — filesystem, OS-level IO
403
+ // "config" — env reads, flags, dotenv
404
+ // "business" — high-complexity decision logic
405
+ //
406
+ // New categories can be added by workers; the bot
407
+ // silently drops unknown values so older bots keep working.
408
+ message ConcernEvidenceRef {
409
+ string category = 1; // category string; the bot silently drops values it does not know
410
+ int32 line = 2; // presumably 1-based like ImportRef.line; TODO confirm
411
+ string symbol = 3; // NOTE(review): exact meaning not specified in this file
412
+ string note = 4; // NOTE(review): exact meaning not specified in this file
413
+ }
414
+
415
+ // ─────────────────────────────────────────────────────────
416
+ // ResolveSymbols — optional capability
417
+ // ─────────────────────────────────────────────────────────
418
+
419
+ message ResolveRequest {
420
+ // Absolute repo root path. See ParseRequest.repo_path
421
+ // for the repo_path / repo_source contract.
422
+ string repo_path = 1;
423
+
424
+ // The unified diff being reviewed. The worker extracts
425
+ // referenced-but-undeclared identifiers from the diff
426
+ // and looks up their declarations.
427
+ string diff = 2;
428
+
429
+ // Repo-relative paths of changed files. Authoritative
430
+ // when present; the worker may also derive its own list
431
+ // from the diff.
432
+ repeated string changed_files = 3;
433
+
434
+ // Self-fetch source — set when the worker should clone
435
+ // the repo itself instead of reading from repo_path. Only sent to workers advertising "self_fetch".
436
+ RepoSource repo_source = 4;
437
+ }
438
+
439
+ message ResolveResponse {
440
+ repeated SymbolDecl symbols = 1; // declarations found for identifiers referenced in the diff
441
+ }
442
+
443
+ message SymbolDecl {
444
+ string name = 1; // identifier whose declaration this is
445
+ string file_path = 2; // repo-relative
446
+ int32 start_line = 3; // presumably 1-based like ImportRef.line; TODO confirm
447
+ int32 end_line = 4;
448
+ string text = 5; // full declaration source
449
+ }
450
+
451
+ // ─────────────────────────────────────────────────────────
452
+ // RunDiagnostics — optional capability
453
+ // ─────────────────────────────────────────────────────────
454
+
455
+ message DiagnosticsRequest {
456
+ // See ParseRequest.repo_path for the repo_path /
457
+ // repo_source contract.
458
+ string repo_path = 1;
459
+
460
+ // Only return diagnostics whose file is in this set.
461
+ // Empty = no filter (return everything tsc / PHPStan /
462
+ // Sorbet found).
463
+ repeated string filter_files = 2;
464
+
465
+ // Self-fetch source — set when the worker should clone
466
+ // the repo itself instead of reading from repo_path. Only sent to workers advertising "self_fetch".
467
+ RepoSource repo_source = 3;
468
+ }
469
+
470
+ message DiagnosticsResponse {
471
+ repeated Diagnostic diagnostics = 1; // empty when the project has no errors or the worker runs no type-checker
472
+ }
473
+
474
+ message Diagnostic {
475
+ string file = 1; // repo-relative
476
+ int32 line = 2; // presumably 1-based like ImportRef.line; TODO confirm
477
+ int32 col = 3; // column within the line; base (0 or 1) not specified here — TODO confirm
478
+ string code = 4; // e.g. "TS2345", "PHPStan.Error", "Sorbet:7008"
479
+ string message = 5; // message text reported by the type-checker
480
+ }