tree_haver 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,175 @@
1
+ # frozen_string_literal: true
2
+
3
module TreeHaver
  module Backends
    # Rust backend using the tree_stump gem
    #
    # This backend wraps the tree_stump gem, which provides Ruby bindings to
    # Tree-sitter written in Rust. It offers native performance with Rust's
    # safety guarantees and includes precompiled binaries for common platforms.
    #
    # tree_stump supports incremental parsing and the Query API, making it
    # suitable for editor/IDE use cases where performance is critical.
    #
    # @note This backend works on MRI Ruby. JRuby/TruffleRuby support is unknown.
    # @see https://github.com/anthropics/tree_stump tree_stump
    module Rust
      # Load state is memoized on the module so `require` is attempted only once
      # (until {reset!} is called).
      @load_attempted = false
      @loaded = false

      class << self
        # Check if the Rust backend is available
        #
        # Attempts to require tree_stump on first call and caches the result.
        # A LoadError is swallowed and reported as `false`.
        #
        # @return [Boolean] true if tree_stump is available
        # @example
        #   if TreeHaver::Backends::Rust.available?
        #     puts "Rust backend is ready"
        #   end
        def available?
          return @loaded if @load_attempted # rubocop:disable ThreadSafety/ClassInstanceVariable

          @load_attempted = true # rubocop:disable ThreadSafety/ClassInstanceVariable
          begin
            require "tree_stump"

            @loaded = true # rubocop:disable ThreadSafety/ClassInstanceVariable
          rescue LoadError
            @loaded = false # rubocop:disable ThreadSafety/ClassInstanceVariable
          end
          @loaded # rubocop:disable ThreadSafety/ClassInstanceVariable
        end

        # Reset the load state (primarily for testing)
        #
        # The next call to {available?} will attempt the require again.
        #
        # @return [void]
        # @api private
        def reset!
          @load_attempted = false # rubocop:disable ThreadSafety/ClassInstanceVariable
          @loaded = false # rubocop:disable ThreadSafety/ClassInstanceVariable
        end

        # Get capabilities supported by this backend
        #
        # NOTE(review): `incremental: true` is advertised here, but
        # {Parser#parse_string} currently ignores the old tree and always
        # performs a full parse - confirm which one reflects the intent.
        #
        # @return [Hash{Symbol => Object}] capability map; empty when the backend is unavailable
        # @example
        #   TreeHaver::Backends::Rust.capabilities
        #   # => { backend: :rust, query: true, bytes_field: true, incremental: true }
        def capabilities
          return {} unless available?

          {
            backend: :rust,
            query: true,
            bytes_field: true,
            incremental: true,
          }
        end
      end

      # Wrapper for tree_stump Language
      #
      # Provides TreeHaver-compatible interface to tree_stump's language loading.
      # tree_stump uses a registration-based API where languages are registered
      # by name, then referenced by that name when setting parser language.
      class Language
        # The registered language name
        # @return [String]
        attr_reader :name

        # @api private
        # @param name [String] the registered language name
        def initialize(name)
          @name = name
        end

        class << self
          # Load a language from a shared library path
          #
          # When `name` is not given it is derived from the file name: the
          # `libtree-sitter-` prefix and everything from the first dot onward
          # are removed (so versioned sonames such as
          # `libtree-sitter-toml.so.0` yield `toml`), and hyphens become
          # underscores (so `libtree-sitter-c-sharp.so` yields `c_sharp`),
          # because the name is used to build the `tree_sitter_<name>` symbol.
          # An explicitly supplied `name` is passed through untouched.
          #
          # @param path [String] absolute path to the language shared library
          # @param symbol [String, nil] the symbol name (accepted for API consistency, but tree_stump derives it from name)
          # @param name [String, nil] logical name for the language (optional, derived from path if not provided)
          # @return [Language] a wrapper holding the registered language name
          # @raise [TreeHaver::NotAvailable] if tree_stump is not available,
          #   the file does not exist, or registration raises a RuntimeError
          # @example
          #   lang = TreeHaver::Backends::Rust::Language.from_library("/usr/local/lib/libtree-sitter-toml.so")
          def from_library(path, symbol: nil, name: nil) # rubocop:disable Lint/UnusedMethodArgument
            raise TreeHaver::NotAvailable, "tree_stump not available" unless Rust.available?

            # Validate the path exists before calling register_lang to provide a clear error
            raise TreeHaver::NotAvailable, "Language library not found: #{path}" unless File.exist?(path)

            # tree_stump uses TreeStump.register_lang(name, path) to register languages
            # The name is used to derive the symbol automatically (tree_sitter_<name>)
            lang_name = name || derive_language_name(path)
            ::TreeStump.register_lang(lang_name, path)
            new(lang_name)
          rescue RuntimeError => e
            raise TreeHaver::NotAvailable, "Failed to load language from #{path}: #{e.message}"
          end

          # Alias for compatibility
          #
          # @see from_library
          alias_method :from_path, :from_library

          private

          # Derive a language name from a grammar library file name.
          #
          # @param path [String] path to the shared library
          # @return [String] name with prefix/extensions removed and hyphens replaced by underscores
          def derive_language_name(path)
            stem = File.basename(path).sub(/\Alibtree-sitter-/, "")[/\A[^.]*/]
            stem.tr("-", "_")
          end
        end
      end

      # Wrapper for tree_stump Parser
      #
      # Provides TreeHaver-compatible interface to tree_stump's parser.
      class Parser
        # Create a new parser instance
        #
        # @raise [TreeHaver::NotAvailable] if tree_stump is not available
        def initialize
          raise TreeHaver::NotAvailable, "tree_stump not available" unless Rust.available?

          @parser = ::TreeStump::Parser.new
        end

        # Set the language for this parser
        #
        # @param lang [Language, String] the language to use (Language wrapper or name string)
        # @return [Language, String] the language that was set
        def language=(lang)
          # tree_stump uses set_language with a string name; anything that does
          # not respond to #name is converted with #to_s.
          lang_name = lang.respond_to?(:name) ? lang.name : lang.to_s
          @parser.set_language(lang_name)
          lang
        end

        # Parse source code
        #
        # @param source [String] the source code to parse
        # @return [Object] the parsed syntax tree
        def parse(source)
          @parser.parse(source)
        end

        # Parse source code with optional incremental parsing
        #
        # The `old_tree` argument is accepted for interface compatibility but is
        # currently ignored: every call performs a full parse.
        #
        # @param old_tree [Object, nil] previous tree for incremental parsing (currently unused)
        # @param source [String] the source code to parse
        # @return [Object] the parsed syntax tree
        def parse_string(old_tree, source) # rubocop:disable Lint/UnusedMethodArgument
          # tree_stump doesn't have parse_string, use parse instead
          # TODO: Check if tree_stump supports incremental parsing
          @parser.parse(source)
        end
      end

      # Wrapper for tree_stump Tree
      #
      # Not used directly; TreeHaver passes through tree_stump Tree objects.
      class Tree
        # Not used directly; we pass through tree_stump::Tree
      end

      # Wrapper for tree_stump Node
      #
      # Not used directly; TreeHaver passes through tree_stump::Node objects.
      class Node
        # Not used directly; we pass through tree_stump::Node
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Compatibility shim for code that expects TreeSitter constants
4
+ #
5
+ # When required, this file creates a TreeSitter module that maps to TreeHaver
6
+ # equivalents, allowing code written for ruby_tree_sitter to work with TreeHaver
7
+ # without modification.
8
+ #
9
+ # This shim is safe and idempotent:
10
+ # - If TreeSitter is already defined (real ruby_tree_sitter is loaded), this does nothing
11
+ # - If TreeSitter is not defined, it creates aliases to TreeHaver
12
+ #
13
+ # @example Using the compatibility shim
14
+ # require "tree_haver/compat"
15
+ #
16
+ # # Now code expecting TreeSitter will work
17
+ # parser = TreeSitter::Parser.new # Actually creates TreeHaver::Parser
18
+ # tree = parser.parse(source)
19
+ #
20
+ # @note This is an opt-in feature. Only require this file if you need compatibility
21
+ # @see TreeHaver The main module this aliases to
22
+
23
unless defined?(TreeSitter)
  # Compatibility module aliasing TreeHaver classes to TreeSitter
  #
  # Each constant below is the very same object as its TreeHaver counterpart,
  # so `TreeSitter::Parser.new` builds a `TreeHaver::Parser`.
  #
  # @note Only defined if TreeSitter doesn't already exist
  module TreeSitter
    Error = TreeHaver::Error
    Parser = TreeHaver::Parser
    Tree = TreeHaver::Tree
    Node = TreeHaver::Node
    Language = TreeHaver::Language
  end
end
@@ -0,0 +1,245 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "rbconfig"
4
+
5
module TreeHaver
  # Generic utility for finding tree-sitter grammar shared libraries.
  #
  # GrammarFinder provides platform-aware discovery of tree-sitter grammar
  # libraries. Given a language name, it searches common installation paths
  # and supports environment variable overrides.
  #
  # This class is designed to be used by language-specific merge gems
  # (toml-merge, json-merge, bash-merge, etc.) without requiring TreeHaver
  # to have knowledge of each specific language.
  #
  # == Security Considerations
  #
  # Loading shared libraries is inherently dangerous as it executes arbitrary
  # native code. GrammarFinder delegates its safety checks to {PathValidator}:
  #
  # - Language names are checked with `PathValidator.safe_language_name?`
  # - Paths from environment variables are checked with
  #   `PathValidator.safe_library_path?` before use
  #
  # Directories passed via `extra_paths` are NOT validated by
  # {#find_library_path}; only pass directories you trust.
  #
  # For additional security, use {#find_library_path_safe} which only returns
  # paths accepted by `PathValidator.in_trusted_directory?`.
  #
  # @example Basic usage
  #   finder = TreeHaver::GrammarFinder.new(:toml)
  #   path = finder.find_library_path
  #   # => "/usr/lib/libtree-sitter-toml.so"
  #
  # @example Check availability
  #   finder = TreeHaver::GrammarFinder.new(:json)
  #   if finder.available?
  #     language = TreeHaver::Language.load(finder.language_name, finder.find_library_path)
  #   end
  #
  # @example Register with TreeHaver
  #   finder = TreeHaver::GrammarFinder.new(:bash)
  #   finder.register! if finder.available?
  #   # Now you can use: TreeHaver::Language.bash
  #
  # @example With custom search paths
  #   finder = TreeHaver::GrammarFinder.new(:toml, extra_paths: ["/opt/custom/lib"])
  #
  # @example Secure mode (trusted directories only)
  #   finder = TreeHaver::GrammarFinder.new(:toml)
  #   path = finder.find_library_path_safe # Only returns paths in trusted dirs
  #
  # @see PathValidator For details on security validations
  class GrammarFinder
    # Common base directories where tree-sitter libraries are installed
    # Platform-specific extensions are appended automatically
    BASE_SEARCH_DIRS = [
      "/usr/lib",
      "/usr/lib64",
      "/usr/local/lib",
      "/opt/homebrew/lib",
    ].freeze

    # @return [Symbol] the language identifier
    attr_reader :language_name

    # @return [Array<String>] additional search paths provided at initialization
    attr_reader :extra_paths

    # Initialize a grammar finder for a specific language
    #
    # The name is downcased before validation and stored as a Symbol.
    #
    # @param language_name [Symbol, String] the tree-sitter language name (e.g., :toml, :json, :bash)
    # @param extra_paths [Array<String>] additional directories to search (searched before the system dirs)
    # @param validate [Boolean] if true, validates the language name (default: true)
    # @raise [ArgumentError] if language_name is invalid and validate is true
    def initialize(language_name, extra_paths: [], validate: true)
      name_str = language_name.to_s.downcase

      if validate && !PathValidator.safe_language_name?(name_str)
        raise ArgumentError, "Invalid language name: #{language_name.inspect}. " \
          "Language names must start with a letter and contain only lowercase letters, numbers, and underscores."
      end

      @language_name = name_str.to_sym
      @extra_paths = Array(extra_paths)
    end

    # Get the environment variable name for this language
    #
    # @return [String] the ENV var name (e.g., "TREE_SITTER_TOML_PATH")
    def env_var_name
      "TREE_SITTER_#{@language_name.to_s.upcase}_PATH"
    end

    # Get the expected symbol name exported by the grammar library
    #
    # @return [String] the symbol name (e.g., "tree_sitter_toml")
    def symbol_name
      "tree_sitter_#{@language_name}"
    end

    # Get the library filename for the current platform
    #
    # @return [String] the library filename (e.g., "libtree-sitter-toml.so")
    def library_filename
      "libtree-sitter-#{@language_name}#{platform_extension}"
    end

    # Generate the list of candidate file paths for this language
    #
    # Order: extra_paths first, then the common system directories. The ENV
    # override is NOT part of this list; {#find_library_path} checks it
    # separately before falling back to these paths.
    #
    # @return [Array<String>] all candidate paths, in search order
    def search_paths
      # Compute the filename once instead of once per directory.
      filename = library_filename
      (@extra_paths + BASE_SEARCH_DIRS).map { |dir| File.join(dir, filename) }
    end

    # Find the grammar library path
    #
    # Searches in order:
    # 1. Environment variable override (validated for safety)
    # 2. Extra paths provided at initialization
    # 3. Common system installation paths
    #
    # @note Paths from ENV are validated using {PathValidator.safe_library_path?}.
    #   Invalid or non-existent ENV paths are ignored and the search continues.
    #
    # @return [String, nil] the path to the library, or nil if not found
    # @see #find_library_path_safe For stricter validation (trusted directories only)
    def find_library_path
      # Check environment variable first (highest priority)
      env_path = ENV[env_var_name]
      return env_path if env_path && PathValidator.safe_library_path?(env_path) && File.exist?(env_path)

      # Fall back to extra_paths (caller-supplied, not validated here) and
      # the built-in system directories.
      search_paths.find { |path| File.exist?(path) }
    end

    # Find the grammar library path with strict security validation
    #
    # This method only returns paths for which
    # `PathValidator.in_trusted_directory?` is true. The environment variable
    # override is not consulted.
    #
    # @return [String, nil] the path to the library, or nil if not found
    # @see PathValidator::TRUSTED_DIRECTORIES For the list of trusted directories
    def find_library_path_safe
      # Environment variable is NOT checked in safe mode - only trusted system paths
      search_paths.find do |path|
        File.exist?(path) && PathValidator.in_trusted_directory?(path)
      end
    end

    # Check if the grammar library is available
    #
    # @return [Boolean] true if the library can be found
    def available?
      !find_library_path.nil?
    end

    # Check if the grammar library is available in a trusted directory
    #
    # @return [Boolean] true if the library can be found in a trusted directory
    # @see #find_library_path_safe
    def available_safe?
      !find_library_path_safe.nil?
    end

    # Register this language with TreeHaver
    #
    # Calls `TreeHaver.register_language` with the discovered path and the
    # derived symbol name.
    #
    # @param raise_on_missing [Boolean] if true, raises when library not found
    # @return [Boolean] true if registration succeeded, false if the library was not found
    # @raise [NotAvailable] if library not found and raise_on_missing is true
    def register!(raise_on_missing: false)
      path = find_library_path
      if path.nil?
        raise NotAvailable, not_found_message if raise_on_missing

        return false
      end

      TreeHaver.register_language(@language_name, path: path, symbol: symbol_name)
      true
    end

    # Get debug information about the search
    #
    # The filesystem search runs once; `found_path` and `available` are both
    # derived from that single result so they can never disagree.
    #
    # @return [Hash] diagnostic information
    def search_info
      found = find_library_path
      {
        language: @language_name,
        env_var: env_var_name,
        env_value: ENV[env_var_name],
        symbol: symbol_name,
        library_filename: library_filename,
        search_paths: search_paths,
        found_path: found,
        available: !found.nil?,
      }
    end

    # Get a human-readable error message when library is not found
    #
    # @return [String] error message with installation hints
    def not_found_message
      "Tree-sitter #{@language_name} grammar not found. " \
        "Searched: #{search_paths.join(", ")}. " \
        "Install tree-sitter-#{@language_name} or set #{env_var_name}."
    end

    private

    # Get the platform-appropriate shared library extension
    #
    # @return [String] ".dylib" on macOS, ".dll" on Windows (mswin/mingw/cygwin),
    #   ".so" everywhere else
    def platform_extension
      case RbConfig::CONFIG["host_os"]
      when /darwin/i
        ".dylib"
      when /mswin|mingw|cygwin/i
        ".dll"
      else
        ".so"
      end
    end
  end
end
@@ -0,0 +1,139 @@
1
+ # frozen_string_literal: true
2
+
3
module TreeHaver
  # Thread-safe language registrations and cache for loaded Language handles
  #
  # The LanguageRegistry provides two main functions:
  # 1. **Registrations**: Store mappings from language names to shared library paths
  # 2. **Cache**: Memoize loaded Language objects to avoid repeated dlopen calls
  #
  # All operations run inside a single module-level mutex. The mutex is not
  # reentrant, so a block passed to {fetch} must not call back into this module.
  #
  # @example Register and cache a language
  #   TreeHaver::LanguageRegistry.register(:toml, path: "/path/to/lib.so", symbol: "tree_sitter_toml")
  #   lang = TreeHaver::LanguageRegistry.fetch(["/path/to/lib.so", "tree_sitter_toml", "toml"]) do
  #     # This block is called only if not cached
  #     load_language_from_library(...)
  #   end
  #
  # @api private
  module LanguageRegistry
    @mutex = Mutex.new
    @cache = {}
    @registrations = {}

    module_function

    # Register a language helper by name
    #
    # Stores a mapping from a language name to its shared library path and
    # optional exported symbol name. After registration, the language can be
    # accessed via dynamic helpers on {TreeHaver::Language}. Registering the
    # same name again replaces the previous entry.
    #
    # @param name [Symbol, String] language identifier (e.g., :toml, :json)
    # @param path [String] absolute path to the language shared library
    # @param symbol [String, nil] optional exported factory symbol (e.g., "tree_sitter_toml")
    # @return [void]
    # @example
    #   LanguageRegistry.register(:toml, path: "/usr/local/lib/libtree-sitter-toml.so")
    def register(name, path:, symbol: nil)
      key = name.to_sym
      @mutex.synchronize do
        @registrations[key] = {path: path, symbol: symbol}
      end
      nil
    end

    # Unregister a previously registered language helper
    #
    # Removes the registration entry but does not affect cached Language objects.
    #
    # @param name [Symbol, String] language identifier to unregister
    # @return [void]
    # @example
    #   LanguageRegistry.unregister(:toml)
    def unregister(name)
      key = name.to_sym
      @mutex.synchronize do
        @registrations.delete(key)
      end
      nil
    end

    # Fetch a registration entry
    #
    # Returns a copy of the stored path and symbol for a registered language
    # name. A copy is returned (rather than the stored hash itself) so callers
    # cannot mutate registry state outside the mutex.
    #
    # @param name [Symbol, String] language identifier
    # @return [Hash{Symbol => String, nil}, nil] hash with :path and :symbol keys, or nil if not registered
    # @example
    #   entry = LanguageRegistry.registered(:toml)
    #   # => { path: "/usr/local/lib/libtree-sitter-toml.so", symbol: "tree_sitter_toml" }
    def registered(name)
      key = name.to_sym
      @mutex.synchronize { @registrations[key]&.dup }
    end

    # Clear all registrations
    #
    # Removes all registered language mappings. Primarily intended for test cleanup.
    # Does not clear the language cache.
    #
    # @return [void]
    # @example
    #   LanguageRegistry.clear_registrations!
    def clear_registrations!
      @mutex.synchronize { @registrations.clear }
      nil
    end

    # Fetch a cached language by key or compute and store it
    #
    # If the key exists in the cache (even with a nil value), the cached value
    # is returned and the block is not called. Otherwise the block is called,
    # its result is stored under the key, and that result is returned. The
    # block runs while the mutex is held.
    #
    # @param key [Array] cache key, typically [path, symbol, name]
    # @yieldreturn [Object] the computed language handle (called only on cache miss)
    # @return [Object] the cached or computed language handle
    # @example
    #   language = LanguageRegistry.fetch(["/path/lib.so", "symbol", "toml"]) do
    #     expensive_language_load_operation
    #   end
    def fetch(key)
      @mutex.synchronize do
        @cache.fetch(key) { @cache[key] = yield }
      end
    end

    # Clear the language cache
    #
    # Removes all cached Language objects. The next call to {fetch} for any key
    # will recompute the value. Does not clear registrations.
    #
    # @return [void]
    # @example
    #   LanguageRegistry.clear_cache!
    def clear_cache!
      @mutex.synchronize { @cache.clear }
      nil
    end

    # Clear everything (registrations and cache)
    #
    # Removes all registered languages and all cached Language objects.
    # Useful for complete teardown in tests.
    #
    # @return [void]
    # @example
    #   LanguageRegistry.clear_all!
    def clear_all!
      @mutex.synchronize do
        @registrations.clear
        @cache.clear
      end
      nil
    end
  end
end