woods 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +89 -0
  3. data/CODE_OF_CONDUCT.md +83 -0
  4. data/CONTRIBUTING.md +65 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +406 -0
  7. data/exe/woods-console +59 -0
  8. data/exe/woods-console-mcp +22 -0
  9. data/exe/woods-mcp +34 -0
  10. data/exe/woods-mcp-http +37 -0
  11. data/exe/woods-mcp-start +58 -0
  12. data/lib/generators/woods/install_generator.rb +32 -0
  13. data/lib/generators/woods/pgvector_generator.rb +37 -0
  14. data/lib/generators/woods/templates/add_pgvector_to_woods.rb.erb +15 -0
  15. data/lib/generators/woods/templates/create_woods_tables.rb.erb +43 -0
  16. data/lib/tasks/woods.rake +621 -0
  17. data/lib/tasks/woods_evaluation.rake +115 -0
  18. data/lib/woods/ast/call_site_extractor.rb +106 -0
  19. data/lib/woods/ast/method_extractor.rb +71 -0
  20. data/lib/woods/ast/node.rb +116 -0
  21. data/lib/woods/ast/parser.rb +614 -0
  22. data/lib/woods/ast.rb +6 -0
  23. data/lib/woods/builder.rb +200 -0
  24. data/lib/woods/cache/cache_middleware.rb +199 -0
  25. data/lib/woods/cache/cache_store.rb +264 -0
  26. data/lib/woods/cache/redis_cache_store.rb +116 -0
  27. data/lib/woods/cache/solid_cache_store.rb +111 -0
  28. data/lib/woods/chunking/chunk.rb +84 -0
  29. data/lib/woods/chunking/semantic_chunker.rb +295 -0
  30. data/lib/woods/console/adapters/cache_adapter.rb +58 -0
  31. data/lib/woods/console/adapters/good_job_adapter.rb +33 -0
  32. data/lib/woods/console/adapters/job_adapter.rb +68 -0
  33. data/lib/woods/console/adapters/sidekiq_adapter.rb +33 -0
  34. data/lib/woods/console/adapters/solid_queue_adapter.rb +33 -0
  35. data/lib/woods/console/audit_logger.rb +75 -0
  36. data/lib/woods/console/bridge.rb +177 -0
  37. data/lib/woods/console/confirmation.rb +90 -0
  38. data/lib/woods/console/connection_manager.rb +173 -0
  39. data/lib/woods/console/console_response_renderer.rb +74 -0
  40. data/lib/woods/console/embedded_executor.rb +373 -0
  41. data/lib/woods/console/model_validator.rb +81 -0
  42. data/lib/woods/console/rack_middleware.rb +87 -0
  43. data/lib/woods/console/safe_context.rb +82 -0
  44. data/lib/woods/console/server.rb +612 -0
  45. data/lib/woods/console/sql_validator.rb +172 -0
  46. data/lib/woods/console/tools/tier1.rb +118 -0
  47. data/lib/woods/console/tools/tier2.rb +117 -0
  48. data/lib/woods/console/tools/tier3.rb +110 -0
  49. data/lib/woods/console/tools/tier4.rb +79 -0
  50. data/lib/woods/coordination/pipeline_lock.rb +109 -0
  51. data/lib/woods/cost_model/embedding_cost.rb +88 -0
  52. data/lib/woods/cost_model/estimator.rb +128 -0
  53. data/lib/woods/cost_model/provider_pricing.rb +67 -0
  54. data/lib/woods/cost_model/storage_cost.rb +52 -0
  55. data/lib/woods/cost_model.rb +22 -0
  56. data/lib/woods/db/migrations/001_create_units.rb +38 -0
  57. data/lib/woods/db/migrations/002_create_edges.rb +35 -0
  58. data/lib/woods/db/migrations/003_create_embeddings.rb +37 -0
  59. data/lib/woods/db/migrations/004_create_snapshots.rb +45 -0
  60. data/lib/woods/db/migrations/005_create_snapshot_units.rb +40 -0
  61. data/lib/woods/db/migrations/006_rename_tables.rb +34 -0
  62. data/lib/woods/db/migrator.rb +73 -0
  63. data/lib/woods/db/schema_version.rb +73 -0
  64. data/lib/woods/dependency_graph.rb +236 -0
  65. data/lib/woods/embedding/indexer.rb +140 -0
  66. data/lib/woods/embedding/openai.rb +126 -0
  67. data/lib/woods/embedding/provider.rb +162 -0
  68. data/lib/woods/embedding/text_preparer.rb +112 -0
  69. data/lib/woods/evaluation/baseline_runner.rb +115 -0
  70. data/lib/woods/evaluation/evaluator.rb +139 -0
  71. data/lib/woods/evaluation/metrics.rb +79 -0
  72. data/lib/woods/evaluation/query_set.rb +148 -0
  73. data/lib/woods/evaluation/report_generator.rb +90 -0
  74. data/lib/woods/extracted_unit.rb +145 -0
  75. data/lib/woods/extractor.rb +1028 -0
  76. data/lib/woods/extractors/action_cable_extractor.rb +201 -0
  77. data/lib/woods/extractors/ast_source_extraction.rb +46 -0
  78. data/lib/woods/extractors/behavioral_profile.rb +309 -0
  79. data/lib/woods/extractors/caching_extractor.rb +261 -0
  80. data/lib/woods/extractors/callback_analyzer.rb +246 -0
  81. data/lib/woods/extractors/concern_extractor.rb +292 -0
  82. data/lib/woods/extractors/configuration_extractor.rb +219 -0
  83. data/lib/woods/extractors/controller_extractor.rb +404 -0
  84. data/lib/woods/extractors/database_view_extractor.rb +278 -0
  85. data/lib/woods/extractors/decorator_extractor.rb +253 -0
  86. data/lib/woods/extractors/engine_extractor.rb +223 -0
  87. data/lib/woods/extractors/event_extractor.rb +211 -0
  88. data/lib/woods/extractors/factory_extractor.rb +289 -0
  89. data/lib/woods/extractors/graphql_extractor.rb +892 -0
  90. data/lib/woods/extractors/i18n_extractor.rb +117 -0
  91. data/lib/woods/extractors/job_extractor.rb +374 -0
  92. data/lib/woods/extractors/lib_extractor.rb +218 -0
  93. data/lib/woods/extractors/mailer_extractor.rb +269 -0
  94. data/lib/woods/extractors/manager_extractor.rb +188 -0
  95. data/lib/woods/extractors/middleware_extractor.rb +133 -0
  96. data/lib/woods/extractors/migration_extractor.rb +469 -0
  97. data/lib/woods/extractors/model_extractor.rb +988 -0
  98. data/lib/woods/extractors/phlex_extractor.rb +252 -0
  99. data/lib/woods/extractors/policy_extractor.rb +191 -0
  100. data/lib/woods/extractors/poro_extractor.rb +229 -0
  101. data/lib/woods/extractors/pundit_extractor.rb +223 -0
  102. data/lib/woods/extractors/rails_source_extractor.rb +473 -0
  103. data/lib/woods/extractors/rake_task_extractor.rb +343 -0
  104. data/lib/woods/extractors/route_extractor.rb +181 -0
  105. data/lib/woods/extractors/scheduled_job_extractor.rb +331 -0
  106. data/lib/woods/extractors/serializer_extractor.rb +339 -0
  107. data/lib/woods/extractors/service_extractor.rb +217 -0
  108. data/lib/woods/extractors/shared_dependency_scanner.rb +91 -0
  109. data/lib/woods/extractors/shared_utility_methods.rb +281 -0
  110. data/lib/woods/extractors/state_machine_extractor.rb +398 -0
  111. data/lib/woods/extractors/test_mapping_extractor.rb +225 -0
  112. data/lib/woods/extractors/validator_extractor.rb +211 -0
  113. data/lib/woods/extractors/view_component_extractor.rb +311 -0
  114. data/lib/woods/extractors/view_template_extractor.rb +261 -0
  115. data/lib/woods/feedback/gap_detector.rb +89 -0
  116. data/lib/woods/feedback/store.rb +119 -0
  117. data/lib/woods/filename_utils.rb +32 -0
  118. data/lib/woods/flow_analysis/operation_extractor.rb +206 -0
  119. data/lib/woods/flow_analysis/response_code_mapper.rb +154 -0
  120. data/lib/woods/flow_assembler.rb +290 -0
  121. data/lib/woods/flow_document.rb +191 -0
  122. data/lib/woods/flow_precomputer.rb +102 -0
  123. data/lib/woods/formatting/base.rb +30 -0
  124. data/lib/woods/formatting/claude_adapter.rb +98 -0
  125. data/lib/woods/formatting/generic_adapter.rb +56 -0
  126. data/lib/woods/formatting/gpt_adapter.rb +64 -0
  127. data/lib/woods/formatting/human_adapter.rb +78 -0
  128. data/lib/woods/graph_analyzer.rb +374 -0
  129. data/lib/woods/mcp/bootstrapper.rb +96 -0
  130. data/lib/woods/mcp/index_reader.rb +394 -0
  131. data/lib/woods/mcp/renderers/claude_renderer.rb +81 -0
  132. data/lib/woods/mcp/renderers/json_renderer.rb +17 -0
  133. data/lib/woods/mcp/renderers/markdown_renderer.rb +353 -0
  134. data/lib/woods/mcp/renderers/plain_renderer.rb +240 -0
  135. data/lib/woods/mcp/server.rb +962 -0
  136. data/lib/woods/mcp/tool_response_renderer.rb +85 -0
  137. data/lib/woods/model_name_cache.rb +51 -0
  138. data/lib/woods/notion/client.rb +217 -0
  139. data/lib/woods/notion/exporter.rb +219 -0
  140. data/lib/woods/notion/mapper.rb +40 -0
  141. data/lib/woods/notion/mappers/column_mapper.rb +57 -0
  142. data/lib/woods/notion/mappers/migration_mapper.rb +39 -0
  143. data/lib/woods/notion/mappers/model_mapper.rb +161 -0
  144. data/lib/woods/notion/mappers/shared.rb +22 -0
  145. data/lib/woods/notion/rate_limiter.rb +68 -0
  146. data/lib/woods/observability/health_check.rb +79 -0
  147. data/lib/woods/observability/instrumentation.rb +34 -0
  148. data/lib/woods/observability/structured_logger.rb +57 -0
  149. data/lib/woods/operator/error_escalator.rb +81 -0
  150. data/lib/woods/operator/pipeline_guard.rb +92 -0
  151. data/lib/woods/operator/status_reporter.rb +80 -0
  152. data/lib/woods/railtie.rb +38 -0
  153. data/lib/woods/resilience/circuit_breaker.rb +99 -0
  154. data/lib/woods/resilience/index_validator.rb +167 -0
  155. data/lib/woods/resilience/retryable_provider.rb +108 -0
  156. data/lib/woods/retrieval/context_assembler.rb +261 -0
  157. data/lib/woods/retrieval/query_classifier.rb +133 -0
  158. data/lib/woods/retrieval/ranker.rb +277 -0
  159. data/lib/woods/retrieval/search_executor.rb +316 -0
  160. data/lib/woods/retriever.rb +152 -0
  161. data/lib/woods/ruby_analyzer/class_analyzer.rb +170 -0
  162. data/lib/woods/ruby_analyzer/dataflow_analyzer.rb +77 -0
  163. data/lib/woods/ruby_analyzer/fqn_builder.rb +18 -0
  164. data/lib/woods/ruby_analyzer/mermaid_renderer.rb +280 -0
  165. data/lib/woods/ruby_analyzer/method_analyzer.rb +143 -0
  166. data/lib/woods/ruby_analyzer/trace_enricher.rb +143 -0
  167. data/lib/woods/ruby_analyzer.rb +87 -0
  168. data/lib/woods/session_tracer/file_store.rb +104 -0
  169. data/lib/woods/session_tracer/middleware.rb +143 -0
  170. data/lib/woods/session_tracer/redis_store.rb +106 -0
  171. data/lib/woods/session_tracer/session_flow_assembler.rb +254 -0
  172. data/lib/woods/session_tracer/session_flow_document.rb +223 -0
  173. data/lib/woods/session_tracer/solid_cache_store.rb +139 -0
  174. data/lib/woods/session_tracer/store.rb +81 -0
  175. data/lib/woods/storage/graph_store.rb +120 -0
  176. data/lib/woods/storage/metadata_store.rb +196 -0
  177. data/lib/woods/storage/pgvector.rb +195 -0
  178. data/lib/woods/storage/qdrant.rb +205 -0
  179. data/lib/woods/storage/vector_store.rb +167 -0
  180. data/lib/woods/temporal/json_snapshot_store.rb +245 -0
  181. data/lib/woods/temporal/snapshot_store.rb +345 -0
  182. data/lib/woods/token_utils.rb +19 -0
  183. data/lib/woods/version.rb +5 -0
  184. data/lib/woods.rb +246 -0
  185. metadata +270 -0
@@ -0,0 +1,345 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'time'
5
+
6
+ module Woods
7
+ module Temporal
8
+ # SnapshotStore captures and queries temporal snapshots of extraction runs.
9
+ #
10
+ # Each snapshot is anchored to a git commit SHA and stores per-unit content
11
+ # hashes for efficient diff computation. Full source is not duplicated —
12
+ # only hashes of source, metadata, and dependencies are stored per snapshot.
13
+ #
14
+ # @example Capturing a snapshot
15
+ # store = SnapshotStore.new(connection: db)
16
+ # store.capture(manifest, unit_hashes)
17
+ #
18
+ # @example Comparing snapshots
19
+ # diff = store.diff("abc123", "def456")
20
+ # diff[:added] # => [{ identifier: "NewModel", ... }]
21
+ # diff[:modified] # => [{ identifier: "User", ... }]
22
+ # diff[:deleted] # => [{ identifier: "OldService", ... }]
23
+ #
24
class SnapshotStore # rubocop:disable Metrics/ClassLength
  # Per-unit hash columns whose difference marks a unit as modified.
  HASH_FIELDS = %i[source_hash metadata_hash dependencies_hash].freeze

  # @param connection [Object] Database connection supporting #execute,
  #   #get_first_row, #get_first_value and #transaction (all four are called
  #   by this class)
  def initialize(connection:)
    @db = connection
  end

  # Capture a snapshot after extraction completes.
  #
  # Stores the manifest metadata and per-unit content hashes, then computes
  # diff stats against the most recent snapshot of a *different* commit.
  # Re-capturing a SHA therefore keeps meaningful stats instead of diffing
  # the snapshot against itself.
  #
  # @param manifest [Hash] The manifest data (string or symbol keys)
  # @param unit_hashes [Array<Hash>] Per-unit content hashes
  # @return [Hash, nil] Snapshot record with diff stats, or nil when the
  #   manifest carries no git_sha
  def capture(manifest, unit_hashes)
    git_sha = mget(manifest, 'git_sha')
    return nil unless git_sha

    previous = find_latest(excluding_sha: git_sha)
    # Remember the row id this SHA had before the upsert. INSERT OR REPLACE
    # may hand the row a new id (depends on the schema, which is not visible
    # here); unit rows keyed by the old id would otherwise be left behind.
    stale_id = fetch_snapshot_id(git_sha)
    upsert_snapshot(manifest, git_sha, unit_hashes.size)

    snapshot_id = fetch_snapshot_id(git_sha)
    [stale_id, snapshot_id].compact.uniq.each { |id| delete_snapshot_units(id) }
    insert_unit_hashes(snapshot_id, unit_hashes)

    update_diff_stats(snapshot_id, previous)
    find(git_sha)
  end

  # List snapshots, optionally filtered by branch.
  #
  # @param limit [Integer] Max results (default 20)
  # @param branch [String, nil] Filter by branch name
  # @return [Array<Hash>] Snapshot summaries sorted by extracted_at descending
  def list(limit: 20, branch: nil)
    rows = if branch
             @db.execute(
               'SELECT * FROM woods_snapshots WHERE git_branch = ? ORDER BY extracted_at DESC LIMIT ?',
               [branch, limit]
             )
           else
             @db.execute(
               'SELECT * FROM woods_snapshots ORDER BY extracted_at DESC LIMIT ?',
               [limit]
             )
           end

    rows.map { |row| row_to_hash(row) }
  end

  # Find a specific snapshot by git SHA.
  #
  # @param git_sha [String]
  # @return [Hash, nil] Snapshot metadata or nil if not found
  def find(git_sha)
    row = @db.get_first_row('SELECT * FROM woods_snapshots WHERE git_sha = ?', [git_sha])
    row && row_to_hash(row)
  end

  # Compute diff between two snapshots.
  #
  # Returns an empty diff when either SHA has no snapshot.
  #
  # @param sha_a [String] Before snapshot git SHA
  # @param sha_b [String] After snapshot git SHA
  # @return [Hash] {added: [...], modified: [...], deleted: [...]}
  def diff(sha_a, sha_b)
    id_a = fetch_snapshot_id(sha_a)
    id_b = fetch_snapshot_id(sha_b)
    return { added: [], modified: [], deleted: [] } unless id_a && id_b

    compute_diff(load_snapshot_units(id_a), load_snapshot_units(id_b))
  end

  # History of a single unit across snapshots, newest first.
  #
  # One extra (older) row is fetched as a look-ahead so the oldest entry that
  # is actually returned is compared against its real predecessor; it is only
  # flagged as a first appearance when no older row exists.
  #
  # @param identifier [String] Unit identifier
  # @param limit [Integer] Max snapshots to return (default 20)
  # @return [Array<Hash>] Entries with git_sha, extracted_at, source_hash, changed flag
  def unit_history(identifier, limit: 20)
    # Non-positive limits are passed through untouched so their meaning stays
    # whatever the underlying connection defines.
    fetch_limit = limit.positive? ? limit + 1 : limit
    rows = @db.execute(<<~SQL, [identifier, fetch_limit])
      SELECT su.source_hash, su.metadata_hash, su.dependencies_hash, su.unit_type,
             s.git_sha, s.extracted_at, s.git_branch
      FROM woods_snapshot_units su
      JOIN woods_snapshots s ON s.id = su.snapshot_id
      WHERE su.identifier = ?
      ORDER BY s.extracted_at DESC
      LIMIT ?
    SQL

    entries = mark_changed_entries(rows.map { |row| history_entry_from_row(row) })
    limit.positive? ? entries.first(limit) : entries
  end

  private

  # Build a history entry hash from a database row.
  #
  # @param row [Hash]
  # @return [Hash]
  def history_entry_from_row(row)
    {
      git_sha: row['git_sha'],
      extracted_at: row['extracted_at'],
      git_branch: row['git_branch'],
      unit_type: row['unit_type'],
      source_hash: row['source_hash'],
      metadata_hash: row['metadata_hash'],
      dependencies_hash: row['dependencies_hash']
    }
  end

  # Set the :changed flag on history entries (ordered newest first) by
  # comparing each entry's source hash with the next-older entry.
  #
  # @param entries [Array<Hash>]
  # @return [Array<Hash>] The same array, mutated in place
  def mark_changed_entries(entries)
    entries.each_with_index do |entry, index|
      older = entries[index + 1]
      # The last entry has nothing older to compare with, so it counts as changed.
      entry[:changed] = older.nil? || entry[:source_hash] != older[:source_hash]
    end
    entries
  end

  # Get a value from a hash that may have string or symbol keys.
  #
  # @param hash [Hash]
  # @param key [String]
  # @return [Object, nil]
  def mget(hash, key)
    hash[key] || hash[key.to_sym]
  end

  # Insert or replace the snapshot row from manifest data.
  #
  # @param manifest [Hash]
  # @param git_sha [String]
  # @param default_total [Integer] Used when the manifest has no total_units
  # @return [void]
  def upsert_snapshot(manifest, git_sha, default_total)
    params = [
      git_sha,
      mget(manifest, 'git_branch'),
      mget(manifest, 'extracted_at') || Time.now.iso8601,
      mget(manifest, 'rails_version'),
      mget(manifest, 'ruby_version'),
      mget(manifest, 'total_units') || default_total,
      JSON.generate(mget(manifest, 'counts') || {}),
      mget(manifest, 'gemfile_lock_sha'),
      mget(manifest, 'schema_sha')
    ]
    @db.execute(<<~SQL, params)
      INSERT OR REPLACE INTO woods_snapshots
        (git_sha, git_branch, extracted_at, rails_version, ruby_version,
         total_units, unit_counts, gemfile_lock_sha, schema_sha)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    SQL
  end

  # Update a snapshot's diff stats vs. a previous snapshot.
  #
  # @param snapshot_id [Integer]
  # @param previous [Hash, nil]
  # @return [void]
  def update_diff_stats(snapshot_id, previous)
    stats = compute_diff_stats(snapshot_id, previous)
    @db.execute(
      'UPDATE woods_snapshots SET units_added = ?, units_modified = ?, units_deleted = ? WHERE id = ?',
      [stats[:added], stats[:modified], stats[:deleted], snapshot_id]
    )
  end

  # Find the most recent snapshot, optionally ignoring one commit.
  #
  # @param excluding_sha [String, nil] git SHA to leave out of the search
  # @return [Hash, nil]
  def find_latest(excluding_sha: nil)
    row = if excluding_sha
            @db.get_first_row(
              'SELECT * FROM woods_snapshots WHERE git_sha != ? ORDER BY extracted_at DESC LIMIT 1',
              [excluding_sha]
            )
          else
            @db.get_first_row('SELECT * FROM woods_snapshots ORDER BY extracted_at DESC LIMIT 1')
          end
    row && row_to_hash(row)
  end

  # Fetch a snapshot's ID by git SHA.
  #
  # @param git_sha [String]
  # @return [Integer, nil]
  def fetch_snapshot_id(git_sha)
    @db.get_first_value('SELECT id FROM woods_snapshots WHERE git_sha = ?', [git_sha])
  end

  # Remove every per-unit hash record belonging to a snapshot.
  #
  # @param snapshot_id [Integer]
  # @return [void]
  def delete_snapshot_units(snapshot_id)
    @db.execute('DELETE FROM woods_snapshot_units WHERE snapshot_id = ?', [snapshot_id])
  end

  # Insert per-unit hash records for a snapshot.
  #
  # @param snapshot_id [Integer]
  # @param unit_hashes [Array<Hash>]
  # @return [void]
  def insert_unit_hashes(snapshot_id, unit_hashes)
    sql = <<~SQL
      INSERT INTO woods_snapshot_units
        (snapshot_id, identifier, unit_type, source_hash, metadata_hash, dependencies_hash)
      VALUES (?, ?, ?, ?, ?, ?)
    SQL

    # Wrap in a transaction to batch all inserts into a single commit,
    # reducing per-row fsync overhead from O(n) to O(1).
    @db.transaction do
      unit_hashes.each do |uh|
        @db.execute(sql, [
                      snapshot_id,
                      mget(uh, 'identifier'),
                      mget(uh, 'type').to_s,
                      mget(uh, 'source_hash'),
                      mget(uh, 'metadata_hash'),
                      mget(uh, 'dependencies_hash')
                    ])
      end
    end
  end

  # Load all unit records for a snapshot as a hash keyed by identifier.
  #
  # @param snapshot_id [Integer]
  # @return [Hash{String => Hash}]
  def load_snapshot_units(snapshot_id)
    sql = <<~SQL
      SELECT identifier, unit_type, source_hash, metadata_hash, dependencies_hash
      FROM woods_snapshot_units WHERE snapshot_id = ?
    SQL

    @db.execute(sql, [snapshot_id]).to_h do |row|
      [row['identifier'], {
        unit_type: row['unit_type'],
        source_hash: row['source_hash'],
        metadata_hash: row['metadata_hash'],
        dependencies_hash: row['dependencies_hash']
      }]
    end
  end

  # Compute diff between two sets of unit hashes.
  #
  # @param units_a [Hash{String => Hash}] Before
  # @param units_b [Hash{String => Hash}] After
  # @return [Hash] {added: [...], modified: [...], deleted: [...]}
  def compute_diff(units_a, units_b)
    added = []
    modified = []

    units_b.each do |identifier, data_b|
      entry = { identifier: identifier, unit_type: data_b[:unit_type] }
      if !units_a.key?(identifier)
        added << entry # present only in B
      elsif HASH_FIELDS.any? { |field| units_a[identifier][field] != data_b[field] }
        modified << entry # present in both, at least one hash differs
      end
    end

    # Present only in A
    deleted = units_a.reject { |identifier, _| units_b.key?(identifier) }.map do |identifier, data_a|
      { identifier: identifier, unit_type: data_a[:unit_type] }
    end

    { added: added, modified: modified, deleted: deleted }
  end

  # Compute aggregate diff stats.
  #
  # @param current_snapshot_id [Integer]
  # @param previous_snapshot [Hash, nil]
  # @return [Hash] {added:, modified:, deleted:} counts; all zero when there
  #   is no previous snapshot to compare against
  def compute_diff_stats(current_snapshot_id, previous_snapshot)
    prev_id = previous_snapshot && fetch_snapshot_id(previous_snapshot[:git_sha])
    return { added: 0, modified: 0, deleted: 0 } unless prev_id

    result = compute_diff(load_snapshot_units(prev_id), load_snapshot_units(current_snapshot_id))
    { added: result[:added].size, modified: result[:modified].size, deleted: result[:deleted].size }
  end

  # Convert a database row to a normalized hash.
  #
  # @param row [Hash] Result row with string keys
  # @return [Hash]
  def row_to_hash(row)
    {
      id: row['id'],
      git_sha: row['git_sha'],
      git_branch: row['git_branch'],
      extracted_at: row['extracted_at'],
      rails_version: row['rails_version'],
      ruby_version: row['ruby_version'],
      total_units: row['total_units'],
      unit_counts: row['unit_counts'] ? JSON.parse(row['unit_counts']) : {},
      gemfile_lock_sha: row['gemfile_lock_sha'],
      schema_sha: row['schema_sha'],
      units_added: row['units_added'],
      units_modified: row['units_modified'],
      units_deleted: row['units_deleted']
    }
  end
end
344
+ end
345
+ end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
module Woods
  # Shared token estimation utility.
  #
  # Uses project convention: (string.length / 4.0).ceil
  # See docs/TOKEN_BENCHMARK.md — conservative floor (~10.6% overestimate).
  module TokenUtils
    # Characters assumed per token by the project convention.
    CHARS_PER_TOKEN = 4.0

    module_function

    # Estimate token count for a string.
    #
    # The input is coerced with #to_s, so nil is treated as an empty string
    # and yields 0 instead of raising NoMethodError.
    #
    # @param text [String, nil] Text to estimate
    # @return [Integer] Estimated token count
    def estimate_tokens(text)
      (text.to_s.length / CHARS_PER_TOKEN).ceil
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Woods
  # Gem version string.
  VERSION = '1.0.0'
end
data/lib/woods.rb ADDED
@@ -0,0 +1,246 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Woods - Rails Codebase Indexing and Retrieval
4
+ #
5
+ # A system for extracting, indexing, and retrieving context from Rails codebases
6
+ # to enable AI-assisted development, debugging, and analytics.
7
+ #
8
+ # ## Quick Start
9
+ #
10
+ # # Extract codebase
11
+ # Woods.extract!
12
+ #
13
+ # # Or via rake
14
+ # bundle exec rake woods:extract
15
+ #
16
+ # ## Configuration
17
+ #
18
+ # Woods.configure do |config|
19
+ # config.output_dir = Rails.root.join("tmp/woods")
20
+ # config.max_context_tokens = 8000
21
+ # config.include_framework_sources = true
22
+ # end
23
+ #
24
+ require_relative 'woods/version'
25
+
26
+ module Woods
27
# Base class for every error raised by this library.
class Error < StandardError
end

# Raised when a configuration value is rejected.
class ConfigurationError < Error
end

# Error class for extraction failures.
class ExtractionError < Error
end

# Error class for the session tracer.
class SessionTracerError < Error
end

# Lock taken by the module-level configure helpers.
CONFIG_MUTEX = Mutex.new
33
+
34
+ # ════════════════════════════════════════════════════════════════════════
35
+ # Configuration
36
+ # ════════════════════════════════════════════════════════════════════════
37
+
38
# Holds every tunable setting of the library.
#
# Plain settings are exposed through attr_accessor; settings that need
# validation expose a reader plus a hand-written writer that raises
# ConfigurationError on invalid input.
class Configuration
  # Accepted values for #context_format=.
  VALID_CONTEXT_FORMATS = %i[claude markdown plain json].freeze

  # Output directory used when no Rails root is available.
  FALLBACK_OUTPUT_DIR = 'tmp/woods'

  attr_accessor :embedding_model, :include_framework_sources, :gem_configs,
                :vector_store, :metadata_store, :graph_store, :embedding_provider, :log_level,
                :vector_store_options, :metadata_store_options, :embedding_options,
                :concurrent_extraction, :precompute_flows, :enable_snapshots,
                :session_tracer_enabled, :session_store, :session_id_proc, :session_exclude_paths,
                :console_mcp_enabled, :console_mcp_path, :console_redacted_columns,
                :notion_api_token, :notion_database_ids,
                :cache_store, :cache_options
  attr_reader :max_context_tokens, :similarity_threshold, :extractors, :pretty_json, :context_format,
              :cache_enabled

  def initialize # rubocop:disable Metrics/MethodLength
    @output_dir = nil # Resolved lazily; Rails.root is nil at require time
    @embedding_model = 'text-embedding-3-small'
    @max_context_tokens = 8000
    @similarity_threshold = 0.7
    @include_framework_sources = true
    @gem_configs = {}
    @extractors = %i[models controllers services components view_components jobs mailers graphql serializers
                     managers policies validators rails_source]
    @pretty_json = true
    @concurrent_extraction = false
    @precompute_flows = false
    @enable_snapshots = false
    @context_format = :markdown
    @session_tracer_enabled = false
    @session_store = nil
    @session_id_proc = nil
    @session_exclude_paths = []
    @console_mcp_enabled = false
    @console_mcp_path = '/mcp/console'
    @console_redacted_columns = []
    @notion_api_token = nil
    @notion_database_ids = {}
    @cache_enabled = false
    @cache_store = nil # :redis, :solid_cache, :memory, or a CacheStore instance
    @cache_options = {} # { redis: client, cache: store, ttl: { embeddings: 86400, ... } }
  end

  # Output directory, defaulting to Rails.root/tmp/woods.
  #
  # The Rails-based default is memoized once Rails.root is available. The
  # relative fallback is deliberately NOT memoized, so a read that happens
  # before Rails.root is set does not pin the path for the rest of the process.
  #
  # @return [Pathname, String]
  def output_dir
    return @output_dir if @output_dir

    # The Rails constant can exist without a .root method; guard with
    # respond_to? so that case falls back instead of raising NoMethodError.
    rails_root = (defined?(Rails) && Rails.respond_to?(:root)) ? Rails.root : nil
    return FALLBACK_OUTPUT_DIR unless rails_root

    @output_dir = rails_root.join('tmp/woods')
  end

  # @param value [Object] Must respond to #to_s
  # @raise [ConfigurationError] if value is nil
  def output_dir=(value)
    raise ConfigurationError, 'output_dir cannot be nil' if value.nil?

    @output_dir = value
  end

  # @param value [Integer] Must be a positive Integer
  # @raise [ConfigurationError] if value is not a positive Integer
  def max_context_tokens=(value)
    unless value.is_a?(Integer) && value.positive?
      raise ConfigurationError, "max_context_tokens must be a positive Integer, got #{value.inspect}"
    end

    @max_context_tokens = value
  end

  # Stores the threshold as a Float.
  #
  # @param value [Numeric] Must be between 0.0 and 1.0 inclusive
  # @raise [ConfigurationError] if value is out of range or not numeric
  def similarity_threshold=(value)
    raise ConfigurationError, "similarity_threshold must be Numeric, got #{value.inspect}" unless value.is_a?(Numeric)

    float_val = value.to_f
    unless float_val.between?(0.0, 1.0)
      raise ConfigurationError, "similarity_threshold must be between 0.0 and 1.0, got #{value.inspect}"
    end

    @similarity_threshold = float_val
  end

  # @param value [Array<Symbol>] List of extractor names
  # @raise [ConfigurationError] if value is not an Array of Symbols
  def extractors=(value)
    unless value.is_a?(Array) && value.all?(Symbol)
      raise ConfigurationError, "extractors must be an Array of Symbols, got #{value.inspect}"
    end

    @extractors = value
  end

  # @param value [Boolean] Must be true or false
  # @raise [ConfigurationError] if value is not a boolean
  def pretty_json=(value)
    validate_boolean!(:pretty_json, value)
    @pretty_json = value
  end

  # @param value [Symbol] Must be one of :claude, :markdown, :plain, :json
  # @raise [ConfigurationError] if value is not a valid format
  def context_format=(value)
    unless VALID_CONTEXT_FORMATS.include?(value)
      raise ConfigurationError,
            "context_format must be one of #{VALID_CONTEXT_FORMATS.inspect}, got #{value.inspect}"
    end

    @context_format = value
  end

  # @param value [Boolean] Enable or disable the cache layer
  # @raise [ConfigurationError] if value is not a boolean
  def cache_enabled=(value)
    validate_boolean!(:cache_enabled, value)
    @cache_enabled = value
  end

  # Add a gem to be indexed. Registering the same gem again replaces its entry.
  #
  # @param gem_name [String] Name of the gem
  # @param paths [Array<String>] Relative paths within the gem to index
  # @param priority [Symbol] :high, :medium, or :low (stored as given, not validated here)
  # @return [Hash] The stored entry
  def add_gem(gem_name, paths:, priority: :medium)
    @gem_configs[gem_name] = { paths: paths, priority: priority }
  end

  private

  # @param name [Symbol] Setting name used in the error message
  # @param value [Object] Candidate value
  # @raise [ConfigurationError] unless value is exactly true or false
  def validate_boolean!(name, value)
    return if value.is_a?(TrueClass) || value.is_a?(FalseClass)

    raise ConfigurationError, "#{name} must be true or false, got #{value.inspect}"
  end
end
166
+
167
+ # ════════════════════════════════════════════════════════════════════════
168
+ # Module Interface
169
+ # ════════════════════════════════════════════════════════════════════════
170
+
171
class << self
  attr_accessor :configuration

  # Create the configuration on first use, optionally let the caller adjust
  # it, and return it. The whole call runs while holding CONFIG_MUTEX.
  #
  # @yieldparam config [Configuration] The current configuration
  # @return [Configuration]
  def configure
    CONFIG_MUTEX.synchronize do
      self.configuration = Configuration.new unless configuration
      yield configuration if block_given?
      configuration
    end
  end

  # Replace the configuration with a named preset, then apply the optional
  # block on top of it.
  #
  # Valid preset names: :local, :postgresql, :production
  #
  # @param name [Symbol] Preset name
  # @yield [config] Optional block for further customization after preset is applied
  # @yieldparam config [Configuration] The configuration object
  # @return [Configuration] The applied configuration
  def configure_with_preset(name)
    CONFIG_MUTEX.synchronize do
      self.configuration = Builder.preset_config(name)
      yield(configuration) if block_given?
      configuration
    end
  end

  # Build a Retriever wired with adapters from the current configuration.
  #
  # @return [Retriever] A fully wired retriever instance
  def build_retriever
    builder = Builder.new(configuration)
    builder.build_retriever
  end

  # Retrieve context for a natural language query using the current configuration.
  #
  # A new retriever is built on every call.
  #
  # @param query [String] Natural language query
  # @param opts [Hash] Options passed through to the retriever (e.g., budget:)
  # @return [Retriever::RetrievalResult] Retrieval result
  def retrieve(query, **opts)
    retriever = build_retriever
    retriever.retrieve(query, **opts)
  end

  # Perform full extraction.
  #
  # @param output_dir [String] Override output directory; the configured
  #   directory is used when omitted
  # @return [Hash] Extraction results
  def extract!(output_dir: nil)
    # Loaded on demand, only when an extraction is actually requested.
    require_relative 'woods/extractor'

    target_dir = output_dir || configuration.output_dir
    Extractor.new(output_dir: target_dir).extract_all
  end

  # Perform incremental extraction into the configured output directory.
  #
  # @param changed_files [Array<String>] List of changed files
  # @return [Array<String>] Re-extracted unit identifiers
  def extract_changed!(changed_files)
    require_relative 'woods/extractor'

    Extractor.new(output_dir: configuration.output_dir).extract_changed(changed_files)
  end
end
237
+
238
+ # Initialize with defaults
239
+ configure
240
+ end
241
+
242
+ require_relative 'woods/builder'
243
+ require_relative 'woods/cost_model'
244
+ require_relative 'woods/cache/cache_store'
245
+ require_relative 'woods/cache/cache_middleware'
246
+ require_relative 'woods/railtie' if defined?(Rails::Railtie)