woods 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +89 -0
  3. data/CODE_OF_CONDUCT.md +83 -0
  4. data/CONTRIBUTING.md +65 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +406 -0
  7. data/exe/woods-console +59 -0
  8. data/exe/woods-console-mcp +22 -0
  9. data/exe/woods-mcp +34 -0
  10. data/exe/woods-mcp-http +37 -0
  11. data/exe/woods-mcp-start +58 -0
  12. data/lib/generators/woods/install_generator.rb +32 -0
  13. data/lib/generators/woods/pgvector_generator.rb +37 -0
  14. data/lib/generators/woods/templates/add_pgvector_to_woods.rb.erb +15 -0
  15. data/lib/generators/woods/templates/create_woods_tables.rb.erb +43 -0
  16. data/lib/tasks/woods.rake +621 -0
  17. data/lib/tasks/woods_evaluation.rake +115 -0
  18. data/lib/woods/ast/call_site_extractor.rb +106 -0
  19. data/lib/woods/ast/method_extractor.rb +71 -0
  20. data/lib/woods/ast/node.rb +116 -0
  21. data/lib/woods/ast/parser.rb +614 -0
  22. data/lib/woods/ast.rb +6 -0
  23. data/lib/woods/builder.rb +200 -0
  24. data/lib/woods/cache/cache_middleware.rb +199 -0
  25. data/lib/woods/cache/cache_store.rb +264 -0
  26. data/lib/woods/cache/redis_cache_store.rb +116 -0
  27. data/lib/woods/cache/solid_cache_store.rb +111 -0
  28. data/lib/woods/chunking/chunk.rb +84 -0
  29. data/lib/woods/chunking/semantic_chunker.rb +295 -0
  30. data/lib/woods/console/adapters/cache_adapter.rb +58 -0
  31. data/lib/woods/console/adapters/good_job_adapter.rb +33 -0
  32. data/lib/woods/console/adapters/job_adapter.rb +68 -0
  33. data/lib/woods/console/adapters/sidekiq_adapter.rb +33 -0
  34. data/lib/woods/console/adapters/solid_queue_adapter.rb +33 -0
  35. data/lib/woods/console/audit_logger.rb +75 -0
  36. data/lib/woods/console/bridge.rb +177 -0
  37. data/lib/woods/console/confirmation.rb +90 -0
  38. data/lib/woods/console/connection_manager.rb +173 -0
  39. data/lib/woods/console/console_response_renderer.rb +74 -0
  40. data/lib/woods/console/embedded_executor.rb +373 -0
  41. data/lib/woods/console/model_validator.rb +81 -0
  42. data/lib/woods/console/rack_middleware.rb +87 -0
  43. data/lib/woods/console/safe_context.rb +82 -0
  44. data/lib/woods/console/server.rb +612 -0
  45. data/lib/woods/console/sql_validator.rb +172 -0
  46. data/lib/woods/console/tools/tier1.rb +118 -0
  47. data/lib/woods/console/tools/tier2.rb +117 -0
  48. data/lib/woods/console/tools/tier3.rb +110 -0
  49. data/lib/woods/console/tools/tier4.rb +79 -0
  50. data/lib/woods/coordination/pipeline_lock.rb +109 -0
  51. data/lib/woods/cost_model/embedding_cost.rb +88 -0
  52. data/lib/woods/cost_model/estimator.rb +128 -0
  53. data/lib/woods/cost_model/provider_pricing.rb +67 -0
  54. data/lib/woods/cost_model/storage_cost.rb +52 -0
  55. data/lib/woods/cost_model.rb +22 -0
  56. data/lib/woods/db/migrations/001_create_units.rb +38 -0
  57. data/lib/woods/db/migrations/002_create_edges.rb +35 -0
  58. data/lib/woods/db/migrations/003_create_embeddings.rb +37 -0
  59. data/lib/woods/db/migrations/004_create_snapshots.rb +45 -0
  60. data/lib/woods/db/migrations/005_create_snapshot_units.rb +40 -0
  61. data/lib/woods/db/migrations/006_rename_tables.rb +34 -0
  62. data/lib/woods/db/migrator.rb +73 -0
  63. data/lib/woods/db/schema_version.rb +73 -0
  64. data/lib/woods/dependency_graph.rb +236 -0
  65. data/lib/woods/embedding/indexer.rb +140 -0
  66. data/lib/woods/embedding/openai.rb +126 -0
  67. data/lib/woods/embedding/provider.rb +162 -0
  68. data/lib/woods/embedding/text_preparer.rb +112 -0
  69. data/lib/woods/evaluation/baseline_runner.rb +115 -0
  70. data/lib/woods/evaluation/evaluator.rb +139 -0
  71. data/lib/woods/evaluation/metrics.rb +79 -0
  72. data/lib/woods/evaluation/query_set.rb +148 -0
  73. data/lib/woods/evaluation/report_generator.rb +90 -0
  74. data/lib/woods/extracted_unit.rb +145 -0
  75. data/lib/woods/extractor.rb +1028 -0
  76. data/lib/woods/extractors/action_cable_extractor.rb +201 -0
  77. data/lib/woods/extractors/ast_source_extraction.rb +46 -0
  78. data/lib/woods/extractors/behavioral_profile.rb +309 -0
  79. data/lib/woods/extractors/caching_extractor.rb +261 -0
  80. data/lib/woods/extractors/callback_analyzer.rb +246 -0
  81. data/lib/woods/extractors/concern_extractor.rb +292 -0
  82. data/lib/woods/extractors/configuration_extractor.rb +219 -0
  83. data/lib/woods/extractors/controller_extractor.rb +404 -0
  84. data/lib/woods/extractors/database_view_extractor.rb +278 -0
  85. data/lib/woods/extractors/decorator_extractor.rb +253 -0
  86. data/lib/woods/extractors/engine_extractor.rb +223 -0
  87. data/lib/woods/extractors/event_extractor.rb +211 -0
  88. data/lib/woods/extractors/factory_extractor.rb +289 -0
  89. data/lib/woods/extractors/graphql_extractor.rb +892 -0
  90. data/lib/woods/extractors/i18n_extractor.rb +117 -0
  91. data/lib/woods/extractors/job_extractor.rb +374 -0
  92. data/lib/woods/extractors/lib_extractor.rb +218 -0
  93. data/lib/woods/extractors/mailer_extractor.rb +269 -0
  94. data/lib/woods/extractors/manager_extractor.rb +188 -0
  95. data/lib/woods/extractors/middleware_extractor.rb +133 -0
  96. data/lib/woods/extractors/migration_extractor.rb +469 -0
  97. data/lib/woods/extractors/model_extractor.rb +988 -0
  98. data/lib/woods/extractors/phlex_extractor.rb +252 -0
  99. data/lib/woods/extractors/policy_extractor.rb +191 -0
  100. data/lib/woods/extractors/poro_extractor.rb +229 -0
  101. data/lib/woods/extractors/pundit_extractor.rb +223 -0
  102. data/lib/woods/extractors/rails_source_extractor.rb +473 -0
  103. data/lib/woods/extractors/rake_task_extractor.rb +343 -0
  104. data/lib/woods/extractors/route_extractor.rb +181 -0
  105. data/lib/woods/extractors/scheduled_job_extractor.rb +331 -0
  106. data/lib/woods/extractors/serializer_extractor.rb +339 -0
  107. data/lib/woods/extractors/service_extractor.rb +217 -0
  108. data/lib/woods/extractors/shared_dependency_scanner.rb +91 -0
  109. data/lib/woods/extractors/shared_utility_methods.rb +281 -0
  110. data/lib/woods/extractors/state_machine_extractor.rb +398 -0
  111. data/lib/woods/extractors/test_mapping_extractor.rb +225 -0
  112. data/lib/woods/extractors/validator_extractor.rb +211 -0
  113. data/lib/woods/extractors/view_component_extractor.rb +311 -0
  114. data/lib/woods/extractors/view_template_extractor.rb +261 -0
  115. data/lib/woods/feedback/gap_detector.rb +89 -0
  116. data/lib/woods/feedback/store.rb +119 -0
  117. data/lib/woods/filename_utils.rb +32 -0
  118. data/lib/woods/flow_analysis/operation_extractor.rb +206 -0
  119. data/lib/woods/flow_analysis/response_code_mapper.rb +154 -0
  120. data/lib/woods/flow_assembler.rb +290 -0
  121. data/lib/woods/flow_document.rb +191 -0
  122. data/lib/woods/flow_precomputer.rb +102 -0
  123. data/lib/woods/formatting/base.rb +30 -0
  124. data/lib/woods/formatting/claude_adapter.rb +98 -0
  125. data/lib/woods/formatting/generic_adapter.rb +56 -0
  126. data/lib/woods/formatting/gpt_adapter.rb +64 -0
  127. data/lib/woods/formatting/human_adapter.rb +78 -0
  128. data/lib/woods/graph_analyzer.rb +374 -0
  129. data/lib/woods/mcp/bootstrapper.rb +96 -0
  130. data/lib/woods/mcp/index_reader.rb +394 -0
  131. data/lib/woods/mcp/renderers/claude_renderer.rb +81 -0
  132. data/lib/woods/mcp/renderers/json_renderer.rb +17 -0
  133. data/lib/woods/mcp/renderers/markdown_renderer.rb +353 -0
  134. data/lib/woods/mcp/renderers/plain_renderer.rb +240 -0
  135. data/lib/woods/mcp/server.rb +962 -0
  136. data/lib/woods/mcp/tool_response_renderer.rb +85 -0
  137. data/lib/woods/model_name_cache.rb +51 -0
  138. data/lib/woods/notion/client.rb +217 -0
  139. data/lib/woods/notion/exporter.rb +219 -0
  140. data/lib/woods/notion/mapper.rb +40 -0
  141. data/lib/woods/notion/mappers/column_mapper.rb +57 -0
  142. data/lib/woods/notion/mappers/migration_mapper.rb +39 -0
  143. data/lib/woods/notion/mappers/model_mapper.rb +161 -0
  144. data/lib/woods/notion/mappers/shared.rb +22 -0
  145. data/lib/woods/notion/rate_limiter.rb +68 -0
  146. data/lib/woods/observability/health_check.rb +79 -0
  147. data/lib/woods/observability/instrumentation.rb +34 -0
  148. data/lib/woods/observability/structured_logger.rb +57 -0
  149. data/lib/woods/operator/error_escalator.rb +81 -0
  150. data/lib/woods/operator/pipeline_guard.rb +92 -0
  151. data/lib/woods/operator/status_reporter.rb +80 -0
  152. data/lib/woods/railtie.rb +38 -0
  153. data/lib/woods/resilience/circuit_breaker.rb +99 -0
  154. data/lib/woods/resilience/index_validator.rb +167 -0
  155. data/lib/woods/resilience/retryable_provider.rb +108 -0
  156. data/lib/woods/retrieval/context_assembler.rb +261 -0
  157. data/lib/woods/retrieval/query_classifier.rb +133 -0
  158. data/lib/woods/retrieval/ranker.rb +277 -0
  159. data/lib/woods/retrieval/search_executor.rb +316 -0
  160. data/lib/woods/retriever.rb +152 -0
  161. data/lib/woods/ruby_analyzer/class_analyzer.rb +170 -0
  162. data/lib/woods/ruby_analyzer/dataflow_analyzer.rb +77 -0
  163. data/lib/woods/ruby_analyzer/fqn_builder.rb +18 -0
  164. data/lib/woods/ruby_analyzer/mermaid_renderer.rb +280 -0
  165. data/lib/woods/ruby_analyzer/method_analyzer.rb +143 -0
  166. data/lib/woods/ruby_analyzer/trace_enricher.rb +143 -0
  167. data/lib/woods/ruby_analyzer.rb +87 -0
  168. data/lib/woods/session_tracer/file_store.rb +104 -0
  169. data/lib/woods/session_tracer/middleware.rb +143 -0
  170. data/lib/woods/session_tracer/redis_store.rb +106 -0
  171. data/lib/woods/session_tracer/session_flow_assembler.rb +254 -0
  172. data/lib/woods/session_tracer/session_flow_document.rb +223 -0
  173. data/lib/woods/session_tracer/solid_cache_store.rb +139 -0
  174. data/lib/woods/session_tracer/store.rb +81 -0
  175. data/lib/woods/storage/graph_store.rb +120 -0
  176. data/lib/woods/storage/metadata_store.rb +196 -0
  177. data/lib/woods/storage/pgvector.rb +195 -0
  178. data/lib/woods/storage/qdrant.rb +205 -0
  179. data/lib/woods/storage/vector_store.rb +167 -0
  180. data/lib/woods/temporal/json_snapshot_store.rb +245 -0
  181. data/lib/woods/temporal/snapshot_store.rb +345 -0
  182. data/lib/woods/token_utils.rb +19 -0
  183. data/lib/woods/version.rb +5 -0
  184. data/lib/woods.rb +246 -0
  185. metadata +270 -0
@@ -0,0 +1,1028 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'digest'
5
+ require 'fileutils'
6
+ require 'open3'
7
+ require 'pathname'
8
+ require 'set'
9
+
10
+ require_relative 'filename_utils'
11
+ require_relative 'extracted_unit'
12
+ require_relative 'dependency_graph'
13
+ require_relative 'extractors/model_extractor'
14
+ require_relative 'extractors/controller_extractor'
15
+ require_relative 'extractors/phlex_extractor'
16
+ require_relative 'extractors/service_extractor'
17
+ require_relative 'extractors/job_extractor'
18
+ require_relative 'extractors/mailer_extractor'
19
+ require_relative 'extractors/graphql_extractor'
20
+ require_relative 'extractors/serializer_extractor'
21
+ require_relative 'extractors/rails_source_extractor'
22
+ require_relative 'extractors/view_component_extractor'
23
+ require_relative 'extractors/manager_extractor'
24
+ require_relative 'extractors/policy_extractor'
25
+ require_relative 'extractors/validator_extractor'
26
+ require_relative 'extractors/concern_extractor'
27
+ require_relative 'extractors/route_extractor'
28
+ require_relative 'extractors/middleware_extractor'
29
+ require_relative 'extractors/i18n_extractor'
30
+ require_relative 'extractors/pundit_extractor'
31
+ require_relative 'extractors/configuration_extractor'
32
+ require_relative 'extractors/engine_extractor'
33
+ require_relative 'extractors/view_template_extractor'
34
+ require_relative 'extractors/migration_extractor'
35
+ require_relative 'extractors/action_cable_extractor'
36
+ require_relative 'extractors/scheduled_job_extractor'
37
+ require_relative 'extractors/rake_task_extractor'
38
+ require_relative 'extractors/state_machine_extractor'
39
+ require_relative 'extractors/event_extractor'
40
+ require_relative 'extractors/decorator_extractor'
41
+ require_relative 'extractors/database_view_extractor'
42
+ require_relative 'extractors/caching_extractor'
43
+ require_relative 'extractors/factory_extractor'
44
+ require_relative 'extractors/test_mapping_extractor'
45
+ require_relative 'extractors/poro_extractor'
46
+ require_relative 'extractors/lib_extractor'
47
+ require_relative 'graph_analyzer'
48
+ require_relative 'model_name_cache'
49
+ require_relative 'flow_precomputer'
50
+
51
+ module Woods
52
+ # Extractor is the main orchestrator for codebase extraction.
53
+ #
54
+ # It coordinates all individual extractors, builds the dependency graph,
55
+ # enriches with git data, and outputs structured JSON for the indexing pipeline.
56
+ #
57
+ # @example Full extraction
58
+ # extractor = Extractor.new(output_dir: "tmp/woods")
59
+ # results = extractor.extract_all
60
+ #
61
+ # @example Incremental extraction (for CI)
62
+ # extractor = Extractor.new
63
+ # extractor.extract_changed(["app/models/user.rb", "app/services/checkout.rb"])
64
+ #
65
+ class Extractor
66
+ include FilenameUtils
67
+
68
+ # Directories under app/ that contain classes we need to extract.
69
+ # Used by eager_load_extraction_directories as a fallback when
70
+ # Rails.application.eager_load! fails (e.g., NameError from graphql/).
71
+ EXTRACTION_DIRECTORIES = %w[
72
+ models
73
+ controllers
74
+ services
75
+ jobs
76
+ mailers
77
+ components
78
+ interactors
79
+ operations
80
+ commands
81
+ use_cases
82
+ serializers
83
+ decorators
84
+ blueprinters
85
+ managers
86
+ policies
87
+ validators
88
+ channels
89
+ presenters
90
+ form_objects
91
+ ].freeze
92
+
93
    # Registry of every extractor class, keyed by the plural unit type it
    # produces. Hash order is the iteration order for both sequential
    # extraction and output-directory setup.
    EXTRACTORS = {
      models: Extractors::ModelExtractor,
      controllers: Extractors::ControllerExtractor,
      graphql: Extractors::GraphQLExtractor,
      components: Extractors::PhlexExtractor,
      view_components: Extractors::ViewComponentExtractor,
      services: Extractors::ServiceExtractor,
      jobs: Extractors::JobExtractor,
      mailers: Extractors::MailerExtractor,
      serializers: Extractors::SerializerExtractor,
      managers: Extractors::ManagerExtractor,
      policies: Extractors::PolicyExtractor,
      validators: Extractors::ValidatorExtractor,
      concerns: Extractors::ConcernExtractor,
      routes: Extractors::RouteExtractor,
      middleware: Extractors::MiddlewareExtractor,
      i18n: Extractors::I18nExtractor,
      pundit_policies: Extractors::PunditExtractor,
      configurations: Extractors::ConfigurationExtractor,
      engines: Extractors::EngineExtractor,
      view_templates: Extractors::ViewTemplateExtractor,
      migrations: Extractors::MigrationExtractor,
      action_cable_channels: Extractors::ActionCableExtractor,
      scheduled_jobs: Extractors::ScheduledJobExtractor,
      rake_tasks: Extractors::RakeTaskExtractor,
      state_machines: Extractors::StateMachineExtractor,
      events: Extractors::EventExtractor,
      decorators: Extractors::DecoratorExtractor,
      database_views: Extractors::DatabaseViewExtractor,
      caching: Extractors::CachingExtractor,
      factories: Extractors::FactoryExtractor,
      test_mappings: Extractors::TestMappingExtractor,
      rails_source: Extractors::RailsSourceExtractor,
      poros: Extractors::PoroExtractor,
      libs: Extractors::LibExtractor
    }.freeze
129
+
130
    # Maps singular unit types (as stored in ExtractedUnit/graph nodes)
    # to the plural keys used in the EXTRACTORS constant. Note the four
    # graphql_* types all fan in to the single :graphql extractor.
    #
    # @return [Hash{Symbol => Symbol}]
    TYPE_TO_EXTRACTOR_KEY = {
      model: :models,
      controller: :controllers,
      service: :services,
      component: :components,
      view_component: :view_components,
      job: :jobs,
      mailer: :mailers,
      graphql_type: :graphql,
      graphql_mutation: :graphql,
      graphql_resolver: :graphql,
      graphql_query: :graphql,
      serializer: :serializers,
      manager: :managers,
      policy: :policies,
      validator: :validators,
      concern: :concerns,
      route: :routes,
      middleware: :middleware,
      i18n: :i18n,
      pundit_policy: :pundit_policies,
      configuration: :configurations,
      engine: :engines,
      view_template: :view_templates,
      migration: :migrations,
      action_cable_channel: :action_cable_channels,
      scheduled_job: :scheduled_jobs,
      rake_task: :rake_tasks,
      state_machine: :state_machines,
      event: :events,
      decorator: :decorators,
      database_view: :database_views,
      caching: :caching,
      factory: :factories,
      test_mapping: :test_mappings,
      rails_source: :rails_source,
      poro: :poros,
      lib: :libs
    }.freeze
173
+
174
    # Maps unit types to class-based extractor methods (constantize + call).
    # NOTE(review): presumably consumed by the incremental re-extraction path
    # (re_extract_unit) — that caller is outside this chunk; confirm.
    CLASS_BASED = {
      model: :extract_model, controller: :extract_controller,
      component: :extract_component, view_component: :extract_component,
      mailer: :extract_mailer, action_cable_channel: :extract_channel
    }.freeze
180
+
181
    # Maps unit types to file-based extractor methods (pass file_path).
    # Counterpart of CLASS_BASED for unit types that are re-extracted from a
    # source file rather than a loaded constant.
    FILE_BASED = {
      service: :extract_service_file, job: :extract_job_file,
      serializer: :extract_serializer_file, manager: :extract_manager_file,
      policy: :extract_policy_file, validator: :extract_validator_file,
      concern: :extract_concern_file,
      i18n: :extract_i18n_file,
      pundit_policy: :extract_pundit_file,
      configuration: :extract_configuration_file,
      view_template: :extract_view_template_file,
      migration: :extract_migration_file,
      rake_task: :extract_rake_file,
      decorator: :extract_decorator_file,
      database_view: :extract_view_file,
      caching: :extract_caching_file,
      test_mapping: :extract_test_file,
      poro: :extract_poro_file,
      lib: :extract_lib_file
    }.freeze
200
+
201
    # GraphQL types all use the same extractor method.
    GRAPHQL_TYPES = %i[graphql_type graphql_mutation graphql_resolver graphql_query].freeze

    # @!attribute [r] output_dir
    #   @return [Pathname] root directory for extraction artifacts
    # @!attribute [r] dependency_graph
    #   @return [DependencyGraph] graph built/updated during extraction
    attr_reader :output_dir, :dependency_graph
205
+
206
+ def initialize(output_dir: nil)
207
+ @output_dir = Pathname.new(output_dir || Rails.root.join('tmp/woods'))
208
+ @dependency_graph = DependencyGraph.new
209
+ @results = {}
210
+ @extractors = {}
211
+ end
212
+
213
+ # ══════════════════════════════════════════════════════════════════════
214
+ # Full Extraction
215
+ # ══════════════════════════════════════════════════════════════════════
216
+
217
    # Perform full extraction of the codebase.
    #
    # Runs the pipeline in strictly ordered phases: 1 extract, 1.5 dedup,
    # 2 dependents, 3 graph analysis, 3.5 flows (opt-in), 4 git enrich,
    # 4.5 path normalization, 5 write. See inline notes for the ordering
    # constraints between phases.
    #
    # @return [Hash] Results keyed by extractor type
    def extract_all
      setup_output_directory
      ModelNameCache.reset!

      # Eager load once — all extractors need loaded classes for introspection.
      safe_eager_load!

      # Phase 1: Extract all units (strategy chosen by configuration)
      if Woods.configuration.concurrent_extraction
        extract_all_concurrent
      else
        extract_all_sequential
      end

      # Phase 1.5: Deduplicate results
      Rails.logger.info '[Woods] Deduplicating results...'
      deduplicate_results

      # Rebuild graph from deduped results — Phase 1 registered all units including
      # duplicates, and DependencyGraph has no remove/unregister API.
      @dependency_graph = DependencyGraph.new
      @results.each_value { |units| units.each { |u| @dependency_graph.register(u) } }

      # Phase 2: Resolve dependents (reverse dependencies)
      Rails.logger.info '[Woods] Resolving dependents...'
      resolve_dependents

      # Phase 3: Graph analysis (PageRank, structural metrics)
      Rails.logger.info '[Woods] Analyzing dependency graph...'
      @graph_analysis = GraphAnalyzer.new(@dependency_graph).analyze

      # Phase 3.5: Precompute request flows (opt-in)
      if Woods.configuration.precompute_flows
        Rails.logger.info '[Woods] Precomputing request flows...'
        precompute_flows
      end

      # Phase 4: Enrich with git data
      Rails.logger.info '[Woods] Enriching with git data...'
      enrich_with_git_data

      # Phase 4.5: Normalize file_path to relative paths — must follow git
      # enrichment, which needs absolute paths for File.exist? and git log
      Rails.logger.info '[Woods] Normalizing file paths...'
      normalize_file_paths

      # Phase 5: Write output
      Rails.logger.info '[Woods] Writing output...'
      write_results
      write_dependency_graph
      write_graph_analysis
      write_manifest
      write_structural_summary
      capture_snapshot

      log_summary

      @results
    end
278
+
279
+ # ══════════════════════════════════════════════════════════════════════
280
+ # Incremental Extraction
281
+ # ══════════════════════════════════════════════════════════════════════
282
+
283
    # Extract only units affected by changed files.
    # Used for incremental indexing in CI.
    #
    # @param changed_files [Array<String>] List of changed file paths
    #   (relative, as emitted by git diff, or already absolute)
    # @return [Array<String>] List of re-extracted unit identifiers
    def extract_changed(changed_files)
      # Load existing graph from the previous run, if one was persisted
      graph_path = @output_dir.join('dependency_graph.json')
      @dependency_graph = DependencyGraph.from_h(JSON.parse(File.read(graph_path))) if graph_path.exist?

      ModelNameCache.reset!

      # Eager load to ensure newly-added classes are discoverable.
      safe_eager_load!

      # Normalize relative paths (from git diff) to absolute (as stored in file_map)
      absolute_files = changed_files.map do |f|
        Pathname.new(f).absolute? ? f : Rails.root.join(f).to_s
      end

      # Compute affected units
      affected_ids = @dependency_graph.affected_by(absolute_files)
      Rails.logger.info "[Woods] #{changed_files.size} changed files affect #{affected_ids.size} units"

      # Re-extract affected units; affected_types accumulates which per-type
      # index files need regenerating afterwards
      affected_types = Set.new
      affected_ids.each do |unit_id|
        re_extract_unit(unit_id, affected_types: affected_types)
      end

      # Regenerate type indexes for affected types
      affected_types.each do |type_key|
        regenerate_type_index(type_key)
      end

      # Update graph, manifest, and summary
      write_dependency_graph
      write_manifest
      write_structural_summary
      capture_snapshot

      affected_ids
    end
326
+
327
+ private
328
+
329
+ # ──────────────────────────────────────────────────────────────────────
330
+ # Eager Loading
331
+ # ──────────────────────────────────────────────────────────────────────
332
+
333
+ # Attempt eager_load!, falling back to per-directory loading on NameError.
334
+ #
335
+ # A single NameError (e.g., app/graphql/ referencing an uninstalled gem)
336
+ # aborts eager_load! entirely. Zeitwerk processes dirs alphabetically,
337
+ # so graphql/ before models/ means models never load. The fallback
338
+ # loads only the directories we actually need for extraction.
339
+ def safe_eager_load!
340
+ Rails.application.eager_load!
341
+ rescue NameError => e
342
+ Rails.logger.warn "[Woods] eager_load! hit NameError: #{e.message}"
343
+ Rails.logger.warn '[Woods] Falling back to per-directory eager loading'
344
+ eager_load_extraction_directories
345
+ end
346
+
347
    # Load classes from each extraction-relevant app/ subdirectory individually.
    # Uses Zeitwerk's eager_load_dir when available (Rails 7.1+/Zeitwerk 2.6+),
    # otherwise falls back to Dir.glob + require.
    #
    # Failures are contained at two levels — a per-file rescue inside the
    # glob fallback, and a per-directory rescue around either strategy — so
    # one broken directory never blocks loading of the rest.
    def eager_load_extraction_directories
      loader = Rails.autoloaders.main

      EXTRACTION_DIRECTORIES.each do |subdir|
        dir = Rails.root.join('app', subdir)
        next unless dir.exist?

        begin
          if loader.respond_to?(:eager_load_dir)
            loader.eager_load_dir(dir.to_s)
          else
            Dir.glob(dir.join('**/*.rb')).each do |file|
              require file
            rescue NameError, LoadError => e
              Rails.logger.warn "[Woods] Skipped #{file}: #{e.message}"
            end
          end
        rescue NameError, LoadError => e
          Rails.logger.warn "[Woods] Failed to eager load app/#{subdir}/: #{e.message}"
        end
      end
    end
372
+
373
+ # ──────────────────────────────────────────────────────────────────────
374
+ # Extraction Strategies
375
+ # ──────────────────────────────────────────────────────────────────────
376
+
377
+ def extract_all_sequential
378
+ EXTRACTORS.each do |type, extractor_class|
379
+ Rails.logger.info "[Woods] Extracting #{type}..."
380
+ start_time = Time.current
381
+
382
+ extractor = extractor_class.new
383
+ @extractors[type] = extractor
384
+ units = extractor.extract_all
385
+
386
+ @results[type] = units
387
+
388
+ elapsed = Time.current - start_time
389
+ Rails.logger.info "[Woods] Extracted #{units.size} #{type} in #{elapsed.round(2)}s"
390
+
391
+ # Register in dependency graph
392
+ units.each { |unit| @dependency_graph.register(unit) }
393
+ end
394
+ end
395
+
396
    # Run each extractor in its own thread, then register results sequentially.
    #
    # Thread safety notes:
    # - ModelNameCache is pre-computed before threads start (avoids ||= race)
    # - Each thread gets its own extractor instance (no shared mutable state)
    # - Results collected via Mutex-protected Hash
    # - DependencyGraph registration is sequential (post-join)
    # - A failing extractor logs the error and records an empty result set
    #   rather than aborting the whole run
    def extract_all_concurrent
      # Pre-compute ModelNameCache to avoid race on lazy memoization.
      # Multiple threads calling model_names concurrently could trigger
      # duplicate compute_model_names calls without this warm-up.
      ModelNameCache.model_names
      ModelNameCache.model_names_regex

      results_mutex = Mutex.new
      threads = EXTRACTORS.map do |type, extractor_class|
        Thread.new do
          Rails.logger.info "[Woods] [Thread] Extracting #{type}..."
          start_time = Time.current

          extractor = extractor_class.new
          results_mutex.synchronize { @extractors[type] = extractor }

          units = extractor.extract_all

          elapsed = Time.current - start_time
          Rails.logger.info "[Woods] [Thread] Extracted #{units.size} #{type} in #{elapsed.round(2)}s"

          results_mutex.synchronize do
            @results[type] = units
          end
        rescue StandardError => e
          Rails.logger.error "[Woods] [Thread] #{type} failed: #{e.message}"
          results_mutex.synchronize { @results[type] = [] }
        end
      end

      threads.each(&:join)

      # Register into dependency graph sequentially — DependencyGraph is not thread-safe
      EXTRACTORS.each_key do |type|
        (@results[type] || []).each { |unit| @dependency_graph.register(unit) }
      end
    end
440
+
441
+ # ──────────────────────────────────────────────────────────────────────
442
+ # Setup
443
+ # ──────────────────────────────────────────────────────────────────────
444
+
445
+ def setup_output_directory
446
+ FileUtils.mkdir_p(@output_dir)
447
+ EXTRACTORS.each_key do |type|
448
+ FileUtils.mkdir_p(@output_dir.join(type.to_s))
449
+ end
450
+ end
451
+
452
+ # ──────────────────────────────────────────────────────────────────────
453
+ # Dependency Resolution
454
+ # ──────────────────────────────────────────────────────────────────────
455
+
456
+ def resolve_dependents
457
+ # Build complete unit map first (cross-type dependencies require all units indexed).
458
+ unit_map = @results.each_with_object({}) do |(_type, units), map|
459
+ units.each { |u| map[u.identifier] = u }
460
+ end
461
+
462
+ # Resolve dependents using the complete map.
463
+ @results.each_value do |units|
464
+ units.each do |unit|
465
+ unit.dependencies.each do |dep|
466
+ target_unit = unit_map[dep[:target]]
467
+ next unless target_unit
468
+
469
+ target_unit.dependents ||= []
470
+ target_unit.dependents << {
471
+ type: unit.type,
472
+ identifier: unit.identifier
473
+ }
474
+ end
475
+ end
476
+ end
477
+ end
478
+
479
+ # Remove duplicate units (same identifier) within each type, keeping the first occurrence.
480
+ # Duplicates arise when multiple extractors produce the same unit (e.g., engine-mounted
481
+ # routes duplicating app routes). Without dedup, downstream phases would produce inflated
482
+ # counts, duplicate _index.json entries, and last-writer-wins file overwrites.
483
+ def deduplicate_results
484
+ @results.each do |type, units|
485
+ deduped = units.uniq(&:identifier)
486
+ dropped = units.size - deduped.size
487
+
488
+ Rails.logger.warn "[Woods] Deduplicated #{type}: dropped #{dropped} duplicate(s)" if dropped.positive?
489
+
490
+ @results[type] = deduped
491
+ end
492
+ end
493
+
494
+ # ──────────────────────────────────────────────────────────────────────
495
+ # Flow Precomputation
496
+ # ──────────────────────────────────────────────────────────────────────
497
+
498
+ def precompute_flows
499
+ all_units = @results.values.flatten(1)
500
+ precomputer = FlowPrecomputer.new(units: all_units, graph: @dependency_graph, output_dir: @output_dir.to_s)
501
+ flow_map = precomputer.precompute
502
+ Rails.logger.info "[Woods] Precomputed #{flow_map.size} request flows"
503
+ rescue StandardError => e
504
+ Rails.logger.error "[Woods] Flow precomputation failed: #{e.message}"
505
+ end
506
+
507
+ # ──────────────────────────────────────────────────────────────────────
508
+ # Git Enrichment
509
+ # ──────────────────────────────────────────────────────────────────────
510
+
511
+ def enrich_with_git_data
512
+ return unless git_available?
513
+
514
+ # Collect all file paths that need git data
515
+ file_paths = []
516
+ @results.each do |type, units|
517
+ next if %i[rails_source gem_source].include?(type)
518
+
519
+ units.each do |unit|
520
+ file_paths << unit.file_path if unit.file_path && File.exist?(unit.file_path)
521
+ end
522
+ end
523
+
524
+ # Batch-fetch all git data in minimal subprocess calls
525
+ git_data = batch_git_data(file_paths)
526
+ root = "#{Rails.root}/"
527
+
528
+ # Assign results to units
529
+ @results.each do |type, units|
530
+ next if %i[rails_source gem_source].include?(type)
531
+
532
+ units.each do |unit|
533
+ next unless unit.file_path
534
+
535
+ relative = unit.file_path.sub(root, '')
536
+ unit.metadata[:git] = git_data[relative] if git_data[relative]
537
+ end
538
+ end
539
+ end
540
+
541
+ # Normalize all unit file_paths to relative paths (relative to Rails.root).
542
+ #
543
+ # Extractors set file_path via source_location, which returns absolute paths.
544
+ # This normalization ensures consistent relative paths (e.g., "app/models/user.rb")
545
+ # across all environments (local, Docker, CI) where Rails.root differs.
546
+ #
547
+ # Must run after enrich_with_git_data, which needs absolute paths for
548
+ # File.exist? checks and git log commands.
549
+ def normalize_file_paths
550
+ @results.each_value do |units|
551
+ units.each do |unit|
552
+ unit.file_path = normalize_file_path(unit.file_path)
553
+ end
554
+ end
555
+ end
556
+
557
+ # Strip Rails.root prefix from a file path, converting it to a relative path.
558
+ #
559
+ # @param path [String, nil] Absolute or relative file path
560
+ # @return [String, nil] Relative path, or the original value if already relative,
561
+ # nil, or not under Rails.root (e.g., a gem path)
562
+ def normalize_file_path(path)
563
+ return path unless path
564
+
565
+ root = Rails.root.to_s
566
+ prefix = root.end_with?('/') ? root : "#{root}/"
567
+ path.start_with?(prefix) ? path.sub(prefix, '') : path
568
+ end
569
+
570
+ def git_available?
571
+ return @git_available if defined?(@git_available)
572
+
573
+ @git_available = begin
574
+ _, status = Open3.capture2('git', 'rev-parse', '--git-dir')
575
+ status.success?
576
+ rescue StandardError
577
+ false
578
+ end
579
+ end
580
+
581
+ # Safe git command execution — no shell interpolation
582
+ #
583
+ # @param args [Array<String>] Git command arguments
584
+ # @return [String] Command output (empty string on failure)
585
+ def run_git(*args)
586
+ output, status = Open3.capture2('git', *args)
587
+ status.success? ? output.strip : ''
588
+ rescue StandardError
589
+ ''
590
+ end
591
+
592
+ # Batch-fetch git data for all file paths in two git commands.
593
+ #
594
+ # @param file_paths [Array<String>] Absolute file paths
595
+ # @return [Hash{String => Hash}] Keyed by relative path
596
+ def batch_git_data(file_paths)
597
+ return {} if file_paths.empty?
598
+
599
+ root = "#{Rails.root}/"
600
+ relative_paths = file_paths.map { |f| f.sub(root, '') }
601
+ result = {}
602
+ relative_paths.each { |rp| result[rp] = {} }
603
+
604
+ path_set = relative_paths.to_set
605
+ relative_paths.each_slice(500) do |batch|
606
+ log_output = run_git(
607
+ 'log', '--all', '--name-only',
608
+ '--format=__COMMIT__%H|||%an|||%cI|||%s',
609
+ '--since=365 days ago',
610
+ '--', *batch
611
+ )
612
+ parse_git_log_output(log_output, path_set, result)
613
+ end
614
+
615
+ ninety_days_ago = (Time.current - 90.days).iso8601
616
+ result.each do |relative_path, data|
617
+ result[relative_path] = build_file_metadata(data, ninety_days_ago)
618
+ end
619
+
620
+ result
621
+ end
622
+
623
+ # Parse git log output line-by-line, populating result with per-file commit data.
624
+ def parse_git_log_output(log_output, path_set, result)
625
+ current_commit = nil
626
+
627
+ log_output.each_line do |line|
628
+ line = line.strip
629
+ next if line.empty?
630
+
631
+ if line.start_with?('__COMMIT__')
632
+ parts = line.sub('__COMMIT__', '').split('|||', 4)
633
+ current_commit = { sha: parts[0], author: parts[1], date: parts[2], message: parts[3] }
634
+ elsif current_commit && path_set.include?(line)
635
+ entry = result[line] ||= {}
636
+ unless entry[:last_modified]
637
+ entry[:last_modified] = current_commit[:date]
638
+ entry[:last_author] = current_commit[:author]
639
+ end
640
+ (entry[:commits] ||= []) << current_commit
641
+ (entry[:contributors] ||= Hash.new(0))[current_commit[:author]] += 1
642
+ end
643
+ end
644
+ end
645
+
646
+ # Classify how frequently a file changes based on commit counts.
647
+ def classify_change_frequency(total_count, recent_count)
648
+ if total_count <= 2
649
+ :new
650
+ elsif recent_count >= 10
651
+ :hot
652
+ elsif recent_count >= 3
653
+ :active
654
+ elsif recent_count >= 1
655
+ :stable
656
+ else
657
+ :dormant
658
+ end
659
+ end
660
+
661
+ # Build final metadata hash from raw commit data.
662
+ def build_file_metadata(data, ninety_days_ago)
663
+ all_commits = data[:commits] || []
664
+ contributor_counts = data[:contributors] || {}
665
+ recent_count = all_commits.count { |c| c[:date] && c[:date] > ninety_days_ago }
666
+
667
+ {
668
+ last_modified: data[:last_modified],
669
+ last_author: data[:last_author],
670
+ commit_count: all_commits.size,
671
+ contributors: contributor_counts
672
+ .sort_by { |_, count| -count }
673
+ .first(5)
674
+ .map { |name, count| { name: name, commits: count } },
675
+ recent_commits: all_commits.first(5).map do |c|
676
+ { sha: c[:sha]&.first(8), message: c[:message], date: c[:date], author: c[:author] }
677
+ end,
678
+ change_frequency: classify_change_frequency(all_commits.size, recent_count)
679
+ }
680
+ end
681
+
682
+ # ──────────────────────────────────────────────────────────────────────
683
+ # Output Writers
684
+ # ──────────────────────────────────────────────────────────────────────
685
+
686
+ def write_results
687
+ @results.each do |type, units|
688
+ type_dir = @output_dir.join(type.to_s)
689
+
690
+ units.each do |unit|
691
+ File.write(
692
+ type_dir.join(collision_safe_filename(unit.identifier)),
693
+ json_serialize(unit.to_h)
694
+ )
695
+ end
696
+
697
+ # Also write a type index for fast lookups
698
+ index = units.map do |u|
699
+ {
700
+ identifier: u.identifier,
701
+ file_path: u.file_path,
702
+ namespace: u.namespace,
703
+ estimated_tokens: u.estimated_tokens,
704
+ chunk_count: u.chunks.size
705
+ }
706
+ end
707
+
708
+ File.write(
709
+ type_dir.join('_index.json'),
710
+ json_serialize(index)
711
+ )
712
+ end
713
+ end
714
+
715
+ def write_dependency_graph
716
+ graph_data = @dependency_graph.to_h
717
+ graph_data[:pagerank] = @dependency_graph.pagerank
718
+
719
+ File.write(
720
+ @output_dir.join('dependency_graph.json'),
721
+ json_serialize(graph_data)
722
+ )
723
+ end
724
+
725
+ def write_graph_analysis
726
+ return unless @graph_analysis
727
+
728
+ enriched = @graph_analysis.merge(
729
+ generated_at: Time.current.iso8601,
730
+ graph_sha: Digest::SHA256.hexdigest(
731
+ File.read(@output_dir.join('dependency_graph.json'))
732
+ )
733
+ )
734
+
735
+ File.write(
736
+ @output_dir.join('graph_analysis.json'),
737
+ json_serialize(enriched)
738
+ )
739
+ end
740
+
741
+ def write_manifest
742
+ manifest = {
743
+ extracted_at: Time.current.iso8601,
744
+ rails_version: Rails.version,
745
+ ruby_version: RUBY_VERSION,
746
+
747
+ # Counts by type
748
+ counts: @results.transform_values(&:size),
749
+
750
+ # Total stats
751
+ total_units: @results.values.sum(&:size),
752
+ total_chunks: @results.sum { |_, units| units.sum { |u| u.chunks.size } },
753
+
754
+ # Git info — fall back to env vars for Docker/worktree environments
755
+ # where the git repo may not be directly accessible
756
+ git_sha: run_git('rev-parse', 'HEAD').presence || ENV['GIT_SHA'].presence,
757
+ git_branch: run_git('rev-parse', '--abbrev-ref', 'HEAD').presence || ENV['GIT_BRANCH'].presence,
758
+
759
+ # For change detection
760
+ gemfile_lock_sha: gemfile_lock_sha,
761
+ schema_sha: schema_sha
762
+ }
763
+
764
+ File.write(
765
+ @output_dir.join('manifest.json'),
766
+ json_serialize(manifest)
767
+ )
768
+ end
769
+
770
+ # Capture a temporal snapshot after extraction completes.
771
+ #
772
+ # Reads the manifest and computes per-unit content hashes, then delegates
773
+ # to the SnapshotStore for storage and diff computation. Requires
774
+ # enable_snapshots and a valid git_sha in the manifest.
775
+ #
776
+ # @return [void]
777
+ def capture_snapshot
778
+ return unless Woods.configuration.enable_snapshots
779
+
780
+ manifest_path = @output_dir.join('manifest.json')
781
+ return unless manifest_path.exist?
782
+
783
+ manifest = JSON.parse(File.read(manifest_path))
784
+ return unless manifest['git_sha']
785
+
786
+ store = build_snapshot_store
787
+ return unless store
788
+
789
+ unit_hashes = @results.flat_map do |type, units|
790
+ units.map do |unit|
791
+ {
792
+ 'identifier' => unit.identifier,
793
+ 'type' => type.to_s,
794
+ 'source_hash' => Digest::SHA256.hexdigest(unit.source_code.to_s),
795
+ 'metadata_hash' => Digest::SHA256.hexdigest(unit.metadata.to_json),
796
+ 'dependencies_hash' => Digest::SHA256.hexdigest(unit.dependencies.to_json)
797
+ }
798
+ end
799
+ end
800
+
801
+ store.capture(manifest, unit_hashes)
802
+ Rails.logger.info "[Woods] Snapshot captured for #{manifest['git_sha'][0..7]}"
803
+ rescue StandardError => e
804
+ Rails.logger.error "[Woods] Snapshot capture failed (#{e.class}): #{e.message}"
805
+ end
806
+
807
+ # Build a snapshot store, preferring SQLite with JSON file fallback.
808
+ #
809
+ # @return [Woods::Temporal::SnapshotStore, Woods::Temporal::JsonSnapshotStore, nil]
810
+ def build_snapshot_store
811
+ require 'sqlite3'
812
+ require_relative 'db/migrator'
813
+ require_relative 'temporal/snapshot_store'
814
+
815
+ db_path = @output_dir.join('woods.sqlite3')
816
+ db = SQLite3::Database.new(db_path.to_s)
817
+ db.results_as_hash = true
818
+
819
+ Db::Migrator.new(connection: db).migrate!
820
+ Temporal::SnapshotStore.new(connection: db)
821
+ rescue LoadError
822
+ Rails.logger.info '[Woods] sqlite3 gem not available, using JSON snapshot store'
823
+ require_relative 'temporal/json_snapshot_store'
824
+ Temporal::JsonSnapshotStore.new(dir: @output_dir.to_s)
825
+ end
826
+
827
+ # Write a compact TOC-style summary of extracted units.
828
+ #
829
+ # Produces a SUMMARY.md under 8K tokens (~24KB) by listing one line per
830
+ # category with count and top-5 namespace breakdown, rather than enumerating
831
+ # every unit. Per-unit detail is available in the per-category _index.json files.
832
+ #
833
+ # @return [void]
834
+ def write_structural_summary
835
+ return if @results.empty?
836
+
837
+ total_units = @results.values.sum(&:size)
838
+ total_chunks = @results.sum { |_, units| units.sum { |u| [u.chunks.size, 1].max } }
839
+ category_count = @results.count { |_, units| units.any? }
840
+
841
+ summary = []
842
+ summary << '# Codebase Index Summary'
843
+ summary << "Generated: #{Time.current.iso8601}"
844
+ summary << "Rails #{Rails.version} / Ruby #{RUBY_VERSION}"
845
+ summary << "Units: #{total_units} | Chunks: #{total_chunks} | Categories: #{category_count}"
846
+ summary << ''
847
+
848
+ @results.each do |type, units|
849
+ next if units.empty?
850
+
851
+ summary << "## #{type.to_s.titleize} (#{units.size})"
852
+
853
+ ns_counts = units
854
+ .group_by { |u| u.namespace.nil? || u.namespace.empty? ? '(root)' : u.namespace }
855
+ .transform_values(&:size)
856
+ .sort_by { |_, count| -count }
857
+ .first(5)
858
+
859
+ ns_parts = ns_counts.map { |ns, count| "#{ns} #{count}" }
860
+ summary << "Namespaces: #{ns_parts.join(', ')}" unless ns_parts.empty?
861
+ summary << ''
862
+ end
863
+
864
+ summary << '## Dependency Overview'
865
+ summary << ''
866
+
867
+ graph_stats = @dependency_graph.to_h[:stats]
868
+ if graph_stats
869
+ summary << "- Total nodes: #{graph_stats[:node_count]}"
870
+ summary << "- Total edges: #{graph_stats[:edge_count]}"
871
+ end
872
+
873
+ if @graph_analysis
874
+ hub_nodes = @graph_analysis[:hubs]
875
+ significant_hubs = hub_nodes&.select { |h| h[:dependent_count] > 20 }
876
+ if significant_hubs&.any?
877
+ hub_names = significant_hubs.map { |h| h[:identifier] }.join(', ')
878
+ summary << "- Hub nodes (>20 dependents): #{hub_names}"
879
+ end
880
+ end
881
+
882
+ summary << ''
883
+
884
+ File.write(
885
+ @output_dir.join('SUMMARY.md'),
886
+ summary.join("\n")
887
+ )
888
+ end
889
+
890
+ def regenerate_type_index(type_key)
891
+ type_dir = @output_dir.join(type_key.to_s)
892
+ return unless type_dir.directory?
893
+
894
+ # Scan existing unit JSON files (exclude _index.json)
895
+ index = Dir[type_dir.join('*.json')].filter_map do |file|
896
+ next if File.basename(file) == '_index.json'
897
+
898
+ data = JSON.parse(File.read(file))
899
+ {
900
+ identifier: data['identifier'],
901
+ file_path: data['file_path'],
902
+ namespace: data['namespace'],
903
+ estimated_tokens: data['estimated_tokens'],
904
+ chunk_count: (data['chunks'] || []).size
905
+ }
906
+ end
907
+
908
+ File.write(
909
+ type_dir.join('_index.json'),
910
+ json_serialize(index)
911
+ )
912
+ end
913
+
914
+ # ──────────────────────────────────────────────────────────────────────
915
+ # Helpers
916
+ # ──────────────────────────────────────────────────────────────────────
917
+
918
+ def gemfile_lock_sha
919
+ lock_path = Rails.root.join('Gemfile.lock')
920
+ return nil unless lock_path.exist?
921
+
922
+ Digest::SHA256.file(lock_path).hexdigest
923
+ end
924
+
925
+ def schema_sha
926
+ %w[db/schema.rb db/structure.sql].each do |path|
927
+ full = Rails.root.join(path)
928
+ return Digest::SHA256.file(full).hexdigest if full.exist?
929
+ end
930
+ nil
931
+ end
932
+
933
+ def json_serialize(data)
934
+ if Woods.configuration.pretty_json
935
+ JSON.pretty_generate(data)
936
+ else
937
+ JSON.generate(data)
938
+ end
939
+ end
940
+
941
+ def log_summary
942
+ total = @results.values.sum(&:size)
943
+ chunks = @results.sum { |_, units| units.sum { |u| u.chunks.size } }
944
+
945
+ Rails.logger.info '[Woods] ═══════════════════════════════════════════'
946
+ Rails.logger.info '[Woods] Extraction Complete'
947
+ Rails.logger.info '[Woods] ═══════════════════════════════════════════'
948
+ @results.each do |type, units|
949
+ Rails.logger.info "[Woods] #{type}: #{units.size} units"
950
+ end
951
+ Rails.logger.info '[Woods] ───────────────────────────────────────────'
952
+ Rails.logger.info "[Woods] Total: #{total} units, #{chunks} chunks"
953
+ Rails.logger.info "[Woods] Output: #{@output_dir}"
954
+ Rails.logger.info '[Woods] ═══════════════════════════════════════════'
955
+
956
+ all_warnings = @extractors.flat_map do |_type, ext|
957
+ ext.respond_to?(:warnings) ? ext.warnings : []
958
+ end
959
+
960
+ return if all_warnings.empty?
961
+
962
+ Rails.logger.warn '[Woods] ───────────────────────────────────────────'
963
+ Rails.logger.warn "[Woods] Warnings (#{all_warnings.size}):"
964
+ all_warnings.each { |w| Rails.logger.warn "[Woods] #{w}" }
965
+ end
966
+
967
+ # ──────────────────────────────────────────────────────────────────────
968
+ # Incremental Re-extraction
969
+ # ──────────────────────────────────────────────────────────────────────
970
+
971
    # Incrementally re-extract a single application unit after its source file
    # changed, updating the dependency graph and the unit's on-disk JSON.
    #
    # Dispatch is driven by project-level lookup tables: TYPE_TO_EXTRACTOR_KEY,
    # EXTRACTORS, CLASS_BASED, FILE_BASED, and GRAPHQL_TYPES (declared
    # elsewhere in this file). Each guard below returns silently when the unit
    # cannot be resolved — re-extraction is best-effort.
    #
    # @param unit_id [String] unit identifier; either a constant name
    #   (e.g. "User") or a path-style id (e.g. "rails/...", "gems/...")
    # @param affected_types [#add, nil] optional collector (presumably a Set)
    #   of extractor keys that were touched, so callers can regenerate the
    #   per-type indexes afterwards — confirm against call sites
    # @return [void]
    def re_extract_unit(unit_id, affected_types: nil)
      # Framework source only changes on version updates
      if unit_id.start_with?('rails/') || unit_id.start_with?('gems/')
        Rails.logger.debug "[Woods] Skipping framework re-extraction for #{unit_id}"
        return
      end

      # Find the unit's type from the graph
      node = @dependency_graph.to_h[:nodes][unit_id]
      return unless node

      type = node[:type]&.to_sym
      file_path = node[:file_path]

      # A unit whose backing file vanished (deleted/renamed) is not re-extracted.
      return unless file_path && File.exist?(file_path)

      # Re-extract based on type
      extractor_key = TYPE_TO_EXTRACTOR_KEY[type]
      return unless extractor_key

      extractor = EXTRACTORS[extractor_key]&.new
      return unless extractor

      # Three extraction strategies, chosen by type:
      #  - class-based: constantize the identifier and hand the class over
      #    (the regex ensures unit_id looks like a Ruby constant before
      #    attempting constantize; load errors yield nil and abort quietly)
      #  - file-based: extractor works directly from the file path
      #  - GraphQL: dedicated file-level entry point
      unit = if (method = CLASS_BASED[type])
               klass = if unit_id.match?(/\A[A-Z][A-Za-z0-9_:]*\z/)
                         begin
                           unit_id.constantize
                         rescue StandardError
                           nil
                         end
                       end
               extractor.public_send(method, klass) if klass
             elsif (method = FILE_BASED[type])
               extractor.public_send(method, file_path)
             elsif GRAPHQL_TYPES.include?(type)
               extractor.extract_graphql_file(file_path)
             end

      return unless unit

      # Update dependency graph
      @dependency_graph.register(unit)

      # Track which type was affected
      affected_types&.add(extractor_key)

      # Write updated unit
      type_dir = @output_dir.join(extractor_key.to_s)

      File.write(
        type_dir.join(collision_safe_filename(unit.identifier)),
        json_serialize(unit.to_h)
      )

      Rails.logger.info "[Woods] Re-extracted #{unit_id}"
    end
1027
+ end
1028
+ end