tree_haver 1.0.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/CHANGELOG.md +236 -3
- data/CONTRIBUTING.md +100 -0
- data/README.md +470 -85
- data/lib/tree_haver/backends/citrus.rb +423 -0
- data/lib/tree_haver/backends/ffi.rb +405 -150
- data/lib/tree_haver/backends/java.rb +63 -10
- data/lib/tree_haver/backends/mri.rb +154 -27
- data/lib/tree_haver/backends/rust.rb +58 -27
- data/lib/tree_haver/citrus_grammar_finder.rb +170 -0
- data/lib/tree_haver/grammar_finder.rb +42 -7
- data/lib/tree_haver/language_registry.rb +62 -71
- data/lib/tree_haver/node.rb +526 -0
- data/lib/tree_haver/path_validator.rb +47 -27
- data/lib/tree_haver/tree.rb +259 -0
- data/lib/tree_haver/version.rb +2 -2
- data/lib/tree_haver.rb +741 -285
- data/sig/tree_haver/backends.rbs +68 -1
- data/sig/tree_haver/path_validator.rbs +1 -0
- data/sig/tree_haver.rbs +95 -9
- data.tar.gz.sig +0 -0
- metadata +12 -8
- metadata.gz.sig +0 -0
|
@@ -0,0 +1,423 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module TreeHaver
|
|
4
|
+
module Backends
|
|
5
|
+
# Citrus backend using pure Ruby PEG parser
|
|
6
|
+
#
|
|
7
|
+
# This backend wraps Citrus-based parsers (like toml-rb) to provide a
|
|
8
|
+
# pure Ruby alternative to tree-sitter. Citrus is a PEG (Parsing Expression
|
|
9
|
+
# Grammar) parser generator written in Ruby.
|
|
10
|
+
#
|
|
11
|
+
# Unlike tree-sitter backends which are language-agnostic runtime parsers,
|
|
12
|
+
# Citrus parsers are grammar-specific and compiled into Ruby code. Each
|
|
13
|
+
# language needs its own Citrus grammar (e.g., toml-rb for TOML).
|
|
14
|
+
#
|
|
15
|
+
# @note This backend requires a Citrus grammar for the specific language
|
|
16
|
+
# @see https://github.com/mjackson/citrus Citrus parser generator
|
|
17
|
+
# @see https://github.com/emancu/toml-rb toml-rb (TOML Citrus grammar)
|
|
18
|
+
#
|
|
19
|
+
# @example Using with toml-rb
|
|
20
|
+
# require "toml-rb"
|
|
21
|
+
#
|
|
22
|
+
# parser = TreeHaver::Parser.new
|
|
23
|
+
# # For Citrus, "language" is actually a grammar module
|
|
24
|
+
# parser.language = TomlRB::Document
|
|
25
|
+
# tree = parser.parse(toml_source)
|
|
26
|
+
module Citrus
|
|
27
|
+
# Load-state cache for the optional "citrus" dependency. Both flags start
# out false; `available?` updates them and `reset!` restores them.
@load_attempted = false
@loaded = false

class << self
  # rubocop:disable ThreadSafety/ClassInstanceVariable

  # Check if the Citrus backend is available
  #
  # The first call tries to require "citrus" and remembers whether that
  # succeeded; later calls return the remembered answer without requiring
  # again (until `reset!` is called).
  #
  # @return [Boolean] true if requiring "citrus" succeeded
  # @example
  #   if TreeHaver::Backends::Citrus.available?
  #     puts "Citrus backend is ready"
  #   end
  def available?
    unless @load_attempted
      @load_attempted = true
      @loaded =
        begin
          require "citrus"
          true
        rescue LoadError
          # Only a missing gem is treated as "not available"; any other
          # error raised while loading propagates to the caller.
          false
        end
    end
    @loaded
  end

  # Forget the cached load result so the next `available?` call requires
  # "citrus" again (primarily for testing)
  #
  # @return [void]
  # @api private
  def reset!
    @load_attempted = @loaded = false
  end

  # Get capabilities supported by this backend
  #
  # @return [Hash{Symbol => Object}] capability map, or an empty Hash when
  #   the backend is not available
  # @example
  #   TreeHaver::Backends::Citrus.capabilities
  #   # => { backend: :citrus, query: false, bytes_field: true,
  #   #      incremental: false, pure_ruby: true }
  def capabilities
    if available?
      {
        backend: :citrus,
        query: false,       # no tree-sitter style query API
        bytes_field: true,  # Citrus::Match provides offset and length
        incremental: false, # no incremental parsing
        pure_ruby: true,    # Citrus is pure Ruby (portable)
      }
    else
      {}
    end
  end

  # rubocop:enable ThreadSafety/ClassInstanceVariable
end
|
|
79
|
+
|
|
80
|
+
# Citrus grammar wrapper
|
|
81
|
+
#
|
|
82
|
+
# Unlike tree-sitter which loads compiled .so files, Citrus uses Ruby modules
|
|
83
|
+
# that define grammars. This class wraps a Citrus grammar module.
|
|
84
|
+
#
|
|
85
|
+
# @example
|
|
86
|
+
# # For TOML, use toml-rb's grammar
|
|
87
|
+
# language = TreeHaver::Backends::Citrus::Language.new(TomlRB::Document)
|
|
88
|
+
class Language
  include Comparable

  # @return [Module] the wrapped Citrus grammar module (e.g., TomlRB::Document)
  # @return [Symbol] the backend this language belongs to (always :citrus)
  attr_reader :grammar_module, :backend

  # Wrap a Citrus grammar module.
  #
  # @param grammar_module [Module] any object that responds to :parse
  # @raise [TreeHaver::NotAvailable] if grammar_module does not respond to :parse
  def initialize(grammar_module)
    if grammar_module.respond_to?(:parse)
      @grammar_module = grammar_module
      @backend = :citrus
    else
      raise TreeHaver::NotAvailable,
        "Grammar module must respond to :parse. " \
          "Expected a Citrus grammar module (e.g., TomlRB::Document)."
    end
  end

  # Order languages by the name of their grammar module.
  #
  # Only Language instances with the same backend are comparable; anything
  # else yields nil, which Comparable#== treats as "not equal".
  #
  # @param other [Object] object to compare with
  # @return [Integer, nil] -1, 0, 1, or nil if not comparable
  def <=>(other)
    comparable = other.is_a?(Language) && other.backend == @backend
    return nil unless comparable

    # Names are compared rather than the modules themselves.
    grammar_module.name <=> other.grammar_module.name
  end

  # Hash value built from the same fields `<=>` looks at, so that equal
  # languages land in the same Hash/Set bucket.
  #
  # @return [Integer]
  def hash
    [backend, grammar_module.name].hash
  end

  # Hash lookups use eql?; make it agree with ==.
  alias eql? ==

  # Not applicable for Citrus (tree-sitter-specific).
  #
  # Present for API compatibility only; the arguments are ignored.
  #
  # @param path [String] ignored
  # @param symbol [String, nil] ignored
  # @param name [String, nil] ignored
  # @raise [TreeHaver::NotAvailable] always raises
  def self.from_library(path, symbol: nil, name: nil) # rubocop:disable Lint/UnusedMethodArgument
    raise TreeHaver::NotAvailable,
      "Citrus backend doesn't use shared libraries. " \
        "Use Citrus::Language.new(GrammarModule) instead."
  end

  # from_path is the same method under a second name.
  singleton_class.send(:alias_method, :from_path, :from_library)
end
|
|
150
|
+
|
|
151
|
+
# Citrus parser wrapper
|
|
152
|
+
#
|
|
153
|
+
# Wraps Citrus grammar modules to provide a tree-sitter-like API.
|
|
154
|
+
class Parser
  # Create a new Citrus parser instance with no grammar selected yet.
  #
  # @raise [TreeHaver::NotAvailable] if Citrus.available? is false
  def initialize
    if Citrus.available?
      @grammar = nil
    else
      raise TreeHaver::NotAvailable, "citrus gem not available"
    end
  end

  # Set the grammar for this parser.
  #
  # The value is stored as given, so it must be the grammar module itself
  # (something responding to :parse), not a Language wrapper.
  #
  # @param grammar [Module] Citrus grammar module with a parse method
  # @return [void]
  # @raise [ArgumentError] if grammar does not respond to :parse
  # @example
  #   require "toml-rb"
  #   parser.language = TomlRB::Document
  def language=(grammar)
    if grammar.respond_to?(:parse)
      @grammar = grammar
    else
      raise ArgumentError,
        "Expected Citrus grammar module with parse method, " \
          "got #{grammar.class}"
    end
  end

  # Parse source code with the configured grammar.
  #
  # @param source [String] the source code to parse
  # @return [Tree] backend tree holding the grammar's parse result and the source
  # @raise [TreeHaver::NotAvailable] if no grammar is set
  # @raise [TreeHaver::Error] if the grammar raises ::Citrus::ParseError
  def parse(source)
    active_grammar = @grammar
    raise TreeHaver::NotAvailable, "No grammar loaded" unless active_grammar

    begin
      Tree.new(active_grammar.parse(source), source)
    rescue ::Citrus::ParseError => parse_failure
      # Translate the Citrus-specific failure into the library's own error type.
      raise TreeHaver::Error, "Parse error: #{parse_failure.message}"
    end
  end

  # Parse source code (compatibility with tree-sitter API).
  #
  # There is no incremental parsing here: old_tree is accepted and ignored,
  # and the call simply delegates to #parse.
  #
  # @param old_tree [Object, nil] ignored
  # @param source [String] the source code to parse
  # @return [Tree] same result as #parse
  def parse_string(old_tree, source) # rubocop:disable Lint/UnusedMethodArgument
    parse(source)
  end
end
|
|
214
|
+
|
|
215
|
+
# Citrus tree wrapper
|
|
216
|
+
#
|
|
217
|
+
# Wraps a Citrus::Match (which represents the parse tree) to provide
|
|
218
|
+
# tree-sitter-compatible API.
|
|
219
|
+
#
|
|
220
|
+
# @api private
|
|
221
|
+
class Tree
  # @return [Object] the root match handed to the constructor
  attr_reader :root_match

  # @return [String] the source text handed to the constructor
  attr_reader :source

  # @param root_match [Object] parse result to use as the tree root
  # @param source [String] the text that was parsed
  def initialize(root_match, source)
    @source = source
    @root_match = root_match
  end

  # Wrap the root match in a Node. A new Node is built on every call.
  #
  # @return [Node]
  def root_node
    Node.new(root_match, source)
  end
end
|
|
233
|
+
|
|
234
|
+
# Citrus node wrapper
|
|
235
|
+
#
|
|
236
|
+
# Wraps Citrus::Match objects to provide tree-sitter-compatible node API.
|
|
237
|
+
#
|
|
238
|
+
# Citrus::Match provides:
|
|
239
|
+
# - events[0]: rule name (Symbol) - used as type
|
|
240
|
+
# - offset: byte position
|
|
241
|
+
# - length: byte length
|
|
242
|
+
# - string: matched text
|
|
243
|
+
# - matches: child matches
|
|
244
|
+
# - captures: named groups
|
|
245
|
+
#
|
|
246
|
+
# Language-specific helpers can be mixed in for convenience:
|
|
247
|
+
# require "tree_haver/backends/citrus/toml_helpers"
|
|
248
|
+
# TreeHaver::Backends::Citrus::Node.include(TreeHaver::Backends::Citrus::TomlHelpers)
|
|
249
|
+
#
|
|
250
|
+
# @api private
|
|
251
|
+
class Node
  # @return [Object] the wrapped match object
  # @return [String] the full source text the match came from
  attr_reader :match, :source

  # @param match [Object] match object to wrap (expected to look like a Citrus::Match)
  # @param source [String] the complete source text that was parsed
  def initialize(match, source)
    @match = match
    @source = source
  end

  # Get node type from the match's first event.
  #
  # Falls back to "unknown" when the match exposes no usable events
  # (no #events method, events is not an Array, or it is empty).
  #
  # @return [String] type name derived from the first event
  def type
    return "unknown" unless @match.respond_to?(:events)

    events = @match.events
    return "unknown" unless events.is_a?(Array)
    return "unknown" if events.empty?

    extract_type_from_event(events.first)
  end

  # Check if this node represents a structural element vs a terminal/token.
  #
  # Resolution order:
  # 1. the first event's own #terminal? answer, if it has one;
  # 2. for a Symbol event, the #terminal? answer of the matching rule
  #    looked up through @match.grammar.rules;
  # 3. otherwise the node is assumed to be structural.
  #
  # @return [Boolean] true if this is a structural (non-terminal) node;
  #   false when the match has no events at all
  def structural?
    return false unless @match.respond_to?(:events)

    events = @match.events
    return false if events.empty?

    first_event = events.first
    return !first_event.terminal? if first_event.respond_to?(:terminal?)

    if first_event.is_a?(Symbol) && @match.respond_to?(:grammar)
      grammar = @match.grammar
      if grammar.respond_to?(:rules) && grammar.rules.key?(first_event)
        rule = grammar.rules[first_event]
        return !rule.terminal? if rule.respond_to?(:terminal?)
      end
    end

    # Default: assume structural when nothing says otherwise.
    true
  end

  # @return [Integer] offset at which the match starts
  def start_byte
    @match.offset
  end

  # @return [Integer] offset just past the end of the match
  def end_byte
    @match.offset + @match.length
  end

  # @return [Hash{Symbol => Integer}] zero-based {row:, column:} of the start
  def start_point
    calculate_point(@match.offset)
  end

  # @return [Hash{Symbol => Integer}] zero-based {row:, column:} just past the end
  def end_point
    calculate_point(@match.offset + @match.length)
  end

  # @return [String] the matched text, as reported by the match
  def text
    @match.string
  end

  # @return [Integer] number of child matches (0 if the match has no #matches)
  def child_count
    @match.respond_to?(:matches) ? @match.matches.size : 0
  end

  # Fetch one child by position.
  #
  # @param index [Integer] child position
  # @return [Node, nil] wrapped child, or nil when the match has no
  #   #matches or index is past the last child
  def child(index)
    return unless @match.respond_to?(:matches)

    child_matches = @match.matches
    return if index >= child_matches.size

    Node.new(child_matches[index], @source)
  end

  # @return [Array<Node>] all child matches wrapped as nodes (fresh objects per call)
  def children
    return [] unless @match.respond_to?(:matches)

    @match.matches.map { |child_match| Node.new(child_match, @source) }
  end

  # Iterate over the children.
  #
  # @yieldparam node [Node] each child in order
  # @return [Enumerator] when called without a block
  def each(&block)
    return to_enum(__method__) unless block_given?

    children.each(&block)
  end

  # @return [Boolean] always false; this wrapper does not track error nodes
  def has_error?
    false
  end

  # @return [Boolean] always false; there is no notion of missing nodes here
  def missing?
    false
  end

  # @return [Boolean] always true; every match is treated as a named node
  def named?
    true
  end

  private

  # Extract a type name from an event object.
  #
  # Checks, in order: an event whose #name returns a Symbol, a Symbol, a
  # String, and finally the event's #to_s text (a leading identifier, or a
  # generic label for "(...)*", "(...)?" and "a | b" shapes).
  #
  # @param event [Object] first entry of the match's events
  # @return [String] type name, "unknown" if nothing matched
  def extract_type_from_event(event)
    if event.respond_to?(:name)
      rule_name = event.name
      return rule_name.to_s if rule_name.is_a?(Symbol)
    end

    return event.to_s if event.is_a?(Symbol)
    return event if event.is_a?(String)

    # Fall back to the textual form, e.g. "table", "(comment | table)*", "space?".
    description = event.to_s
    leading_identifier = description[/^([a-z_][a-z0-9_]*)/i, 1]
    return leading_identifier if leading_identifier

    # No leading rule name: label the node by the shape of the expression.
    case description
    when /^\(.*\)\*$/ then "repeat"
    when /^\(.*\)\?$/ then "optional"
    when /^.*\|.*$/ then "choice"
    else "unknown"
    end
  end

  # Convert an offset into a zero-based row/column pair.
  #
  # Works on the text before the offset: the row is the number of newlines
  # in it and the column is the distance from the last of those newlines.
  # Searching only that prefix keeps offset 0 correct; asking the full
  # source for `rindex("\n", offset - 1)` would pass -1 there, which Ruby
  # interprets as "start from the end of the string".
  #
  # NOTE(review): offset is used as a String index (characters); confirm
  # whether Citrus reports byte or character offsets for multibyte input.
  #
  # @param offset [Integer] position within @source
  # @return [Hash{Symbol => Integer}] {row:, column:}
  def calculate_point(offset)
    preceding_text = @source[0...offset]
    last_newline = preceding_text.rindex("\n")
    column = last_newline ? offset - last_newline - 1 : offset

    {row: preceding_text.count("\n"), column: column}
  end
end
|
|
421
|
+
end
|
|
422
|
+
end
|
|
423
|
+
end
|