woods 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +89 -0
  3. data/CODE_OF_CONDUCT.md +83 -0
  4. data/CONTRIBUTING.md +65 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +406 -0
  7. data/exe/woods-console +59 -0
  8. data/exe/woods-console-mcp +22 -0
  9. data/exe/woods-mcp +34 -0
  10. data/exe/woods-mcp-http +37 -0
  11. data/exe/woods-mcp-start +58 -0
  12. data/lib/generators/woods/install_generator.rb +32 -0
  13. data/lib/generators/woods/pgvector_generator.rb +37 -0
  14. data/lib/generators/woods/templates/add_pgvector_to_woods.rb.erb +15 -0
  15. data/lib/generators/woods/templates/create_woods_tables.rb.erb +43 -0
  16. data/lib/tasks/woods.rake +621 -0
  17. data/lib/tasks/woods_evaluation.rake +115 -0
  18. data/lib/woods/ast/call_site_extractor.rb +106 -0
  19. data/lib/woods/ast/method_extractor.rb +71 -0
  20. data/lib/woods/ast/node.rb +116 -0
  21. data/lib/woods/ast/parser.rb +614 -0
  22. data/lib/woods/ast.rb +6 -0
  23. data/lib/woods/builder.rb +200 -0
  24. data/lib/woods/cache/cache_middleware.rb +199 -0
  25. data/lib/woods/cache/cache_store.rb +264 -0
  26. data/lib/woods/cache/redis_cache_store.rb +116 -0
  27. data/lib/woods/cache/solid_cache_store.rb +111 -0
  28. data/lib/woods/chunking/chunk.rb +84 -0
  29. data/lib/woods/chunking/semantic_chunker.rb +295 -0
  30. data/lib/woods/console/adapters/cache_adapter.rb +58 -0
  31. data/lib/woods/console/adapters/good_job_adapter.rb +33 -0
  32. data/lib/woods/console/adapters/job_adapter.rb +68 -0
  33. data/lib/woods/console/adapters/sidekiq_adapter.rb +33 -0
  34. data/lib/woods/console/adapters/solid_queue_adapter.rb +33 -0
  35. data/lib/woods/console/audit_logger.rb +75 -0
  36. data/lib/woods/console/bridge.rb +177 -0
  37. data/lib/woods/console/confirmation.rb +90 -0
  38. data/lib/woods/console/connection_manager.rb +173 -0
  39. data/lib/woods/console/console_response_renderer.rb +74 -0
  40. data/lib/woods/console/embedded_executor.rb +373 -0
  41. data/lib/woods/console/model_validator.rb +81 -0
  42. data/lib/woods/console/rack_middleware.rb +87 -0
  43. data/lib/woods/console/safe_context.rb +82 -0
  44. data/lib/woods/console/server.rb +612 -0
  45. data/lib/woods/console/sql_validator.rb +172 -0
  46. data/lib/woods/console/tools/tier1.rb +118 -0
  47. data/lib/woods/console/tools/tier2.rb +117 -0
  48. data/lib/woods/console/tools/tier3.rb +110 -0
  49. data/lib/woods/console/tools/tier4.rb +79 -0
  50. data/lib/woods/coordination/pipeline_lock.rb +109 -0
  51. data/lib/woods/cost_model/embedding_cost.rb +88 -0
  52. data/lib/woods/cost_model/estimator.rb +128 -0
  53. data/lib/woods/cost_model/provider_pricing.rb +67 -0
  54. data/lib/woods/cost_model/storage_cost.rb +52 -0
  55. data/lib/woods/cost_model.rb +22 -0
  56. data/lib/woods/db/migrations/001_create_units.rb +38 -0
  57. data/lib/woods/db/migrations/002_create_edges.rb +35 -0
  58. data/lib/woods/db/migrations/003_create_embeddings.rb +37 -0
  59. data/lib/woods/db/migrations/004_create_snapshots.rb +45 -0
  60. data/lib/woods/db/migrations/005_create_snapshot_units.rb +40 -0
  61. data/lib/woods/db/migrations/006_rename_tables.rb +34 -0
  62. data/lib/woods/db/migrator.rb +73 -0
  63. data/lib/woods/db/schema_version.rb +73 -0
  64. data/lib/woods/dependency_graph.rb +236 -0
  65. data/lib/woods/embedding/indexer.rb +140 -0
  66. data/lib/woods/embedding/openai.rb +126 -0
  67. data/lib/woods/embedding/provider.rb +162 -0
  68. data/lib/woods/embedding/text_preparer.rb +112 -0
  69. data/lib/woods/evaluation/baseline_runner.rb +115 -0
  70. data/lib/woods/evaluation/evaluator.rb +139 -0
  71. data/lib/woods/evaluation/metrics.rb +79 -0
  72. data/lib/woods/evaluation/query_set.rb +148 -0
  73. data/lib/woods/evaluation/report_generator.rb +90 -0
  74. data/lib/woods/extracted_unit.rb +145 -0
  75. data/lib/woods/extractor.rb +1028 -0
  76. data/lib/woods/extractors/action_cable_extractor.rb +201 -0
  77. data/lib/woods/extractors/ast_source_extraction.rb +46 -0
  78. data/lib/woods/extractors/behavioral_profile.rb +309 -0
  79. data/lib/woods/extractors/caching_extractor.rb +261 -0
  80. data/lib/woods/extractors/callback_analyzer.rb +246 -0
  81. data/lib/woods/extractors/concern_extractor.rb +292 -0
  82. data/lib/woods/extractors/configuration_extractor.rb +219 -0
  83. data/lib/woods/extractors/controller_extractor.rb +404 -0
  84. data/lib/woods/extractors/database_view_extractor.rb +278 -0
  85. data/lib/woods/extractors/decorator_extractor.rb +253 -0
  86. data/lib/woods/extractors/engine_extractor.rb +223 -0
  87. data/lib/woods/extractors/event_extractor.rb +211 -0
  88. data/lib/woods/extractors/factory_extractor.rb +289 -0
  89. data/lib/woods/extractors/graphql_extractor.rb +892 -0
  90. data/lib/woods/extractors/i18n_extractor.rb +117 -0
  91. data/lib/woods/extractors/job_extractor.rb +374 -0
  92. data/lib/woods/extractors/lib_extractor.rb +218 -0
  93. data/lib/woods/extractors/mailer_extractor.rb +269 -0
  94. data/lib/woods/extractors/manager_extractor.rb +188 -0
  95. data/lib/woods/extractors/middleware_extractor.rb +133 -0
  96. data/lib/woods/extractors/migration_extractor.rb +469 -0
  97. data/lib/woods/extractors/model_extractor.rb +988 -0
  98. data/lib/woods/extractors/phlex_extractor.rb +252 -0
  99. data/lib/woods/extractors/policy_extractor.rb +191 -0
  100. data/lib/woods/extractors/poro_extractor.rb +229 -0
  101. data/lib/woods/extractors/pundit_extractor.rb +223 -0
  102. data/lib/woods/extractors/rails_source_extractor.rb +473 -0
  103. data/lib/woods/extractors/rake_task_extractor.rb +343 -0
  104. data/lib/woods/extractors/route_extractor.rb +181 -0
  105. data/lib/woods/extractors/scheduled_job_extractor.rb +331 -0
  106. data/lib/woods/extractors/serializer_extractor.rb +339 -0
  107. data/lib/woods/extractors/service_extractor.rb +217 -0
  108. data/lib/woods/extractors/shared_dependency_scanner.rb +91 -0
  109. data/lib/woods/extractors/shared_utility_methods.rb +281 -0
  110. data/lib/woods/extractors/state_machine_extractor.rb +398 -0
  111. data/lib/woods/extractors/test_mapping_extractor.rb +225 -0
  112. data/lib/woods/extractors/validator_extractor.rb +211 -0
  113. data/lib/woods/extractors/view_component_extractor.rb +311 -0
  114. data/lib/woods/extractors/view_template_extractor.rb +261 -0
  115. data/lib/woods/feedback/gap_detector.rb +89 -0
  116. data/lib/woods/feedback/store.rb +119 -0
  117. data/lib/woods/filename_utils.rb +32 -0
  118. data/lib/woods/flow_analysis/operation_extractor.rb +206 -0
  119. data/lib/woods/flow_analysis/response_code_mapper.rb +154 -0
  120. data/lib/woods/flow_assembler.rb +290 -0
  121. data/lib/woods/flow_document.rb +191 -0
  122. data/lib/woods/flow_precomputer.rb +102 -0
  123. data/lib/woods/formatting/base.rb +30 -0
  124. data/lib/woods/formatting/claude_adapter.rb +98 -0
  125. data/lib/woods/formatting/generic_adapter.rb +56 -0
  126. data/lib/woods/formatting/gpt_adapter.rb +64 -0
  127. data/lib/woods/formatting/human_adapter.rb +78 -0
  128. data/lib/woods/graph_analyzer.rb +374 -0
  129. data/lib/woods/mcp/bootstrapper.rb +96 -0
  130. data/lib/woods/mcp/index_reader.rb +394 -0
  131. data/lib/woods/mcp/renderers/claude_renderer.rb +81 -0
  132. data/lib/woods/mcp/renderers/json_renderer.rb +17 -0
  133. data/lib/woods/mcp/renderers/markdown_renderer.rb +353 -0
  134. data/lib/woods/mcp/renderers/plain_renderer.rb +240 -0
  135. data/lib/woods/mcp/server.rb +962 -0
  136. data/lib/woods/mcp/tool_response_renderer.rb +85 -0
  137. data/lib/woods/model_name_cache.rb +51 -0
  138. data/lib/woods/notion/client.rb +217 -0
  139. data/lib/woods/notion/exporter.rb +219 -0
  140. data/lib/woods/notion/mapper.rb +40 -0
  141. data/lib/woods/notion/mappers/column_mapper.rb +57 -0
  142. data/lib/woods/notion/mappers/migration_mapper.rb +39 -0
  143. data/lib/woods/notion/mappers/model_mapper.rb +161 -0
  144. data/lib/woods/notion/mappers/shared.rb +22 -0
  145. data/lib/woods/notion/rate_limiter.rb +68 -0
  146. data/lib/woods/observability/health_check.rb +79 -0
  147. data/lib/woods/observability/instrumentation.rb +34 -0
  148. data/lib/woods/observability/structured_logger.rb +57 -0
  149. data/lib/woods/operator/error_escalator.rb +81 -0
  150. data/lib/woods/operator/pipeline_guard.rb +92 -0
  151. data/lib/woods/operator/status_reporter.rb +80 -0
  152. data/lib/woods/railtie.rb +38 -0
  153. data/lib/woods/resilience/circuit_breaker.rb +99 -0
  154. data/lib/woods/resilience/index_validator.rb +167 -0
  155. data/lib/woods/resilience/retryable_provider.rb +108 -0
  156. data/lib/woods/retrieval/context_assembler.rb +261 -0
  157. data/lib/woods/retrieval/query_classifier.rb +133 -0
  158. data/lib/woods/retrieval/ranker.rb +277 -0
  159. data/lib/woods/retrieval/search_executor.rb +316 -0
  160. data/lib/woods/retriever.rb +152 -0
  161. data/lib/woods/ruby_analyzer/class_analyzer.rb +170 -0
  162. data/lib/woods/ruby_analyzer/dataflow_analyzer.rb +77 -0
  163. data/lib/woods/ruby_analyzer/fqn_builder.rb +18 -0
  164. data/lib/woods/ruby_analyzer/mermaid_renderer.rb +280 -0
  165. data/lib/woods/ruby_analyzer/method_analyzer.rb +143 -0
  166. data/lib/woods/ruby_analyzer/trace_enricher.rb +143 -0
  167. data/lib/woods/ruby_analyzer.rb +87 -0
  168. data/lib/woods/session_tracer/file_store.rb +104 -0
  169. data/lib/woods/session_tracer/middleware.rb +143 -0
  170. data/lib/woods/session_tracer/redis_store.rb +106 -0
  171. data/lib/woods/session_tracer/session_flow_assembler.rb +254 -0
  172. data/lib/woods/session_tracer/session_flow_document.rb +223 -0
  173. data/lib/woods/session_tracer/solid_cache_store.rb +139 -0
  174. data/lib/woods/session_tracer/store.rb +81 -0
  175. data/lib/woods/storage/graph_store.rb +120 -0
  176. data/lib/woods/storage/metadata_store.rb +196 -0
  177. data/lib/woods/storage/pgvector.rb +195 -0
  178. data/lib/woods/storage/qdrant.rb +205 -0
  179. data/lib/woods/storage/vector_store.rb +167 -0
  180. data/lib/woods/temporal/json_snapshot_store.rb +245 -0
  181. data/lib/woods/temporal/snapshot_store.rb +345 -0
  182. data/lib/woods/token_utils.rb +19 -0
  183. data/lib/woods/version.rb +5 -0
  184. data/lib/woods.rb +246 -0
  185. metadata +270 -0
# frozen_string_literal: true

module Woods
  module Db
    module Migrations
      # Migration 6: renames the legacy codebase_* tables to woods_* as part
      # of the gem's rename from "codebase" to "woods".
      #
      # Each rename is guarded by an existence check (via sqlite_master), so
      # fresh installs — which never had the old tables — are a no-op.
      module RenameTables
        VERSION = 6

        # Apply the migration.
        #
        # @param connection [Object] Database connection
        # @return [void]
        def self.up(connection)
          mapping = {
            'codebase_units' => 'woods_units',
            'codebase_edges' => 'woods_edges',
            'codebase_embeddings' => 'woods_embeddings',
            'codebase_snapshots' => 'woods_snapshots',
            'codebase_snapshot_units' => 'woods_snapshot_units'
          }

          mapping.each_pair do |legacy, current|
            next unless table_exists?(connection, legacy)

            connection.execute("ALTER TABLE #{legacy} RENAME TO #{current}")
          end
        end

        # Check whether a table exists (SQLite-specific catalog lookup).
        #
        # @param connection [Object] Database connection
        # @param name [String] Table name to look up
        # @return [Boolean] true if the table is present
        def self.table_exists?(connection, name)
          rows = connection.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='#{name}'"
          )
          !rows.empty?
        end
      end
    end
  end
end
# frozen_string_literal: true

require_relative 'schema_version'
require_relative 'migrations/001_create_units'
require_relative 'migrations/002_create_edges'
require_relative 'migrations/003_create_embeddings'
require_relative 'migrations/004_create_snapshots'
require_relative 'migrations/005_create_snapshot_units'
require_relative 'migrations/006_rename_tables'

module Woods
  module Db
    # Runs schema migrations against a database connection.
    #
    # Tracks applied migrations via {SchemaVersion} and only runs pending ones.
    # Migrations are defined as modules in `db/migrations/` with a VERSION
    # constant and a `.up(connection)` class method.
    #
    # @example
    #   db = SQLite3::Database.new('woods.db')
    #   migrator = Migrator.new(connection: db)
    #   migrator.migrate! # => [1, 2, 3, 4, 5, 6] on a fresh database
    #
    class Migrator
      # Ordered list of all known migrations. Order must match ascending
      # VERSION numbers so migrations apply in sequence.
      MIGRATIONS = [
        Migrations::CreateUnits,
        Migrations::CreateEdges,
        Migrations::CreateEmbeddings,
        Migrations::CreateSnapshots,
        Migrations::CreateSnapshotUnits,
        Migrations::RenameTables
      ].freeze

      attr_reader :schema_version

      # @param connection [Object] Database connection supporting #execute
      def initialize(connection:)
        @connection = connection
        @schema_version = SchemaVersion.new(connection: connection)
        @schema_version.ensure_table!
      end

      # Run all pending migrations.
      #
      # Each migration's version is recorded immediately after it runs, so a
      # failure partway through leaves earlier migrations correctly recorded
      # as applied.
      #
      # @return [Array<Integer>] Version numbers of newly applied migrations
      def migrate!
        pending_migrations.map do |migration|
          migration.up(@connection)
          @schema_version.record_version(migration::VERSION)
          migration::VERSION
        end
      end

      # List version numbers of pending (unapplied) migrations.
      #
      # @return [Array<Integer>]
      def pending_versions
        pending_migrations.map { |m| m::VERSION }
      end

      private

      # Migrations not yet recorded in the schema migrations table.
      #
      # Single source of truth for "pending" — both #migrate! and
      # #pending_versions derive from this, so the two can never disagree.
      #
      # @return [Array<Module>] Pending migration modules
      def pending_migrations
        applied = @schema_version.applied_versions
        MIGRATIONS.reject { |m| applied.include?(m::VERSION) }
      end
    end
  end
end
# frozen_string_literal: true

module Woods
  module Db
    # Tracks which schema migrations have been applied.
    #
    # Backed by a `woods_schema_migrations` table with a primary-key
    # `version` column. NOTE(review): the SQL here is SQLite dialect
    # (`INSERT OR IGNORE`, `datetime('now')`, `?` placeholders) — the
    # claim that this works against pg/mysql2 connections is unverified;
    # those adapters use different placeholder and upsert syntax. Confirm
    # before relying on non-SQLite backends.
    #
    # @example
    #   db = SQLite3::Database.new('woods.db')
    #   sv = SchemaVersion.new(connection: db)
    #   sv.ensure_table!
    #   sv.current_version # => 0
    #   sv.record_version(1)
    #   sv.current_version # => 1
    #
    class SchemaVersion
      TABLE_NAME = 'woods_schema_migrations'

      # @param connection [Object] Database connection supporting #execute
      def initialize(connection:)
        @connection = connection
      end

      # Create the schema migrations table if it does not exist.
      #
      # @return [void]
      def ensure_table!
        @connection.execute(<<~SQL)
          CREATE TABLE IF NOT EXISTS #{TABLE_NAME} (
            version INTEGER PRIMARY KEY NOT NULL,
            applied_at TEXT NOT NULL DEFAULT (datetime('now'))
          )
        SQL
      end

      # List all applied migration version numbers, sorted ascending.
      #
      # Handles both array rows (SQLite3 default) and hash rows
      # (results_as_hash style adapters).
      #
      # @return [Array<Integer>]
      def applied_versions
        rows = @connection.execute("SELECT version FROM #{TABLE_NAME} ORDER BY version ASC")
        rows.map { |row| row.is_a?(Array) ? row.first : row['version'] }
      end

      # Record a migration version as applied. Re-recording an existing
      # version is a no-op thanks to INSERT OR IGNORE.
      #
      # @param version [Integer] The migration version number
      # @return [void]
      def record_version(version)
        sql = "INSERT OR IGNORE INTO #{TABLE_NAME} (version) VALUES (?)"
        @connection.execute(sql, [version])
      end

      # Check whether a version has been applied.
      #
      # @param version [Integer]
      # @return [Boolean]
      def applied?(version)
        applied_versions.include?(version)
      end

      # The highest applied version, or 0 if none.
      #
      # @return [Integer]
      def current_version
        applied_versions.max || 0
      end
    end
  end
end
# frozen_string_literal: true

require 'set'
require 'json'

module Woods
  # DependencyGraph tracks relationships between code units for:
  # 1. Understanding what depends on what
  # 2. Computing "blast radius" for incremental re-indexing
  # 3. Enabling graph-based retrieval queries
  #
  # The graph is bidirectional - we track both what a unit depends on
  # and what depends on that unit (reverse edges).
  #
  # @example Building and querying the graph
  #   graph = DependencyGraph.new
  #   graph.register(user_model_unit)
  #   graph.register(user_service_unit)
  #
  #   # Find everything affected by a change to user.rb
  #   affected = graph.affected_by(["app/models/user.rb"])
  #
  class DependencyGraph
    def initialize
      @nodes = {}      # identifier => { type:, file_path:, namespace: }
      @edges = {}      # identifier => [dependency identifiers]
      @reverse = {}    # identifier => Set of dependent identifiers
      @file_map = {}   # file_path => identifier
      @type_index = {} # type => Set of identifiers
      @to_h = nil      # memoized serialization cache (see #to_h)
    end

    # Register a unit in the graph. Invalidates the #to_h cache.
    #
    # @param unit [ExtractedUnit] The unit to register
    # @return [void]
    def register(unit)
      @to_h = nil

      @nodes[unit.identifier] = {
        type: unit.type,
        file_path: unit.file_path,
        namespace: unit.namespace
      }

      @edges[unit.identifier] = unit.dependencies.map { |d| d[:target] }
      @file_map[unit.file_path] = unit.identifier if unit.file_path

      # Type index for filtering (Set-based for O(1) insert)
      (@type_index[unit.type] ||= Set.new).add(unit.identifier)

      # Build reverse edges (Set-based for O(1) insert)
      unit.dependencies.each do |dep|
        (@reverse[dep[:target]] ||= Set.new).add(unit.identifier)
      end
    end

    # Find all units affected by changes to given files.
    # Uses BFS over reverse edges to find transitive dependents.
    #
    # @param changed_files [Array<String>] List of changed file paths
    # @param max_depth [Integer] Maximum traversal depth (nil for unlimited)
    # @return [Array<String>] List of affected unit identifiers
    def affected_by(changed_files, max_depth: nil)
      directly_changed = changed_files.filter_map { |f| @file_map[f] }

      affected = Set.new(directly_changed)
      queue = directly_changed.map { |id| [id, 0] } # [identifier, depth]

      while queue.any?
        current, depth = queue.shift
        next if max_depth && depth >= max_depth

        dependents = @reverse[current] || []

        dependents.each do |dep|
          unless affected.include?(dep)
            affected.add(dep)
            queue.push([dep, depth + 1])
          end
        end
      end

      affected.to_a
    end

    # Check if a node exists in the graph by exact identifier.
    #
    # @param identifier [String] Unit identifier to check
    # @return [Boolean] true if the node exists
    def node_exists?(identifier)
      @nodes.key?(identifier)
    end

    # Find a node by suffix matching (e.g., "Update" matches "Order::Update").
    #
    # When multiple nodes share the same suffix, the first match wins.
    # Suffix matching requires a "::" separator — bare identifiers (no namespace)
    # are not matched by this method; use {#node_exists?} for exact lookups.
    #
    # @param suffix [String] The suffix to match against
    # @return [String, nil] The first matching identifier, or nil
    def find_node_by_suffix(suffix)
      target_suffix = "::#{suffix}"
      @nodes.keys.find { |id| id.end_with?(target_suffix) }
    end

    # Get direct dependencies of a unit
    #
    # @param identifier [String] Unit identifier
    # @return [Array<String>] List of dependency identifiers
    def dependencies_of(identifier)
      @edges[identifier] || []
    end

    # Get direct dependents of a unit (what depends on it)
    #
    # @param identifier [String] Unit identifier
    # @return [Array<String>] List of dependent identifiers
    def dependents_of(identifier)
      @reverse.fetch(identifier, Set.new).to_a
    end

    # Get all units of a specific type
    #
    # @param type [Symbol] Unit type (:model, :controller, etc.)
    # @return [Array<String>] List of unit identifiers
    def units_of_type(type)
      @type_index.fetch(type, Set.new).to_a
    end

    # Compute PageRank scores for all nodes
    #
    # Uses the reverse edges (dependents) as the link structure: a node
    # with many dependents gets a higher score. This matches Aider's insight
    # that structural importance correlates with retrieval relevance.
    #
    # @param damping [Float] Damping factor (default: 0.85)
    # @param iterations [Integer] Number of iterations (default: 20)
    # @return [Hash<String, Float>] Identifier => PageRank score
    def pagerank(damping: 0.85, iterations: 20)
      n = @nodes.size
      return {} if n.zero?

      node_ids = @nodes.keys
      base_score = 1.0 / n
      scores = node_ids.to_h { |id| [id, base_score] }

      iterations.times do
        # Collect rank from dangling nodes (no outgoing edges) and redistribute
        dangling_sum = node_ids.sum do |id|
          @edges[id].nil? || @edges[id].empty? ? scores[id] : 0.0
        end

        new_scores = {}

        node_ids.each do |id|
          # Sum contributions from nodes that depend on this one
          incoming = @reverse[id] || []
          rank_sum = incoming.sum do |src|
            out_degree = (@edges[src] || []).size
            out_degree.positive? ? scores[src] / out_degree : 0.0
          end

          new_scores[id] = ((1.0 - damping) / n) + (damping * (rank_sum + (dangling_sum / n)))
        end

        scores = new_scores
      end

      scores
    end

    # Serialize graph for persistence.
    #
    # The expensive assembly is memoized (invalidated on #register). Each
    # call returns a recursive deep copy so callers cannot pollute the
    # cached structure. (Previously this returned a shallow `dup`, which
    # only protected the top level — mutating any nested hash or array
    # corrupted the cache.)
    #
    # @return [Hash] Complete graph data
    def to_h
      @to_h ||= {
        nodes: @nodes,
        edges: @edges,
        reverse: @reverse.transform_values(&:to_a),
        file_map: @file_map,
        type_index: @type_index.transform_values(&:to_a),
        stats: {
          node_count: @nodes.size,
          edge_count: @edges.values.sum(&:size),
          types: @type_index.transform_values(&:size)
        }
      }
      deep_dup(@to_h)
    end

    # Load graph from persisted data
    #
    # After JSON round-trip all keys become strings. This method normalizes
    # them back to the expected types: node values use symbol keys (:type,
    # :file_path, :namespace), and type_index uses symbol keys for types.
    #
    # @param data [Hash] Previously serialized graph data
    # @return [DependencyGraph] Restored graph
    def self.from_h(data)
      graph = new

      raw_nodes = data[:nodes] || data['nodes'] || {}
      graph.instance_variable_set(:@nodes, raw_nodes.transform_values { |v| symbolize_node(v) })

      graph.instance_variable_set(:@edges, data[:edges] || data['edges'] || {})

      raw_reverse = data[:reverse] || data['reverse'] || {}
      graph.instance_variable_set(:@reverse, raw_reverse.transform_values { |v| v.is_a?(Set) ? v : Set.new(v) })

      graph.instance_variable_set(:@file_map, data[:file_map] || data['file_map'] || {})

      raw_type_index = data[:type_index] || data['type_index'] || {}
      graph.instance_variable_set(:@type_index, raw_type_index.transform_keys(&:to_sym).transform_values do |v|
        v.is_a?(Set) ? v : Set.new(v)
      end)

      graph
    end

    # Normalize a node hash to use symbol keys
    #
    # @param node [Hash] Node data with string or symbol keys
    # @return [Hash] Node data with symbol keys
    def self.symbolize_node(node)
      return node unless node.is_a?(Hash)

      {
        type: (node[:type] || node['type'])&.to_sym,
        file_path: node[:file_path] || node['file_path'],
        namespace: node[:namespace] || node['namespace']
      }
    end

    private

    # Recursively copy nested Hashes and Arrays. Leaf values (strings,
    # symbols, numbers) are shared, which is safe as long as callers do not
    # mutate leaf strings in place.
    #
    # @param obj [Object] Structure to copy
    # @return [Object] Deep copy of obj
    def deep_dup(obj)
      case obj
      when Hash then obj.transform_values { |v| deep_dup(v) }
      when Array then obj.map { |v| deep_dup(v) }
      else obj
      end
    end
  end
end
# frozen_string_literal: true

require 'json'
require 'digest'

module Woods
  module Embedding
    # Orchestrates the indexing pipeline: reads extracted units, prepares text,
    # generates embeddings, and stores vectors. Supports full and incremental
    # modes with checkpoint-based resumability.
    class Indexer
      # @param checkpoint_interval [Integer] Save checkpoint every N batches (default: 10)
      def initialize(provider:, text_preparer:, vector_store:, output_dir:, batch_size: 32, checkpoint_interval: 10) # rubocop:disable Metrics/ParameterLists
        @output_dir = output_dir
        @provider = provider
        @text_preparer = text_preparer
        @vector_store = vector_store
        @batch_size = batch_size
        @checkpoint_interval = checkpoint_interval
      end

      # Index all extracted units (full mode). Returns stats hash.
      # @return [Hash] Stats with :processed, :skipped, :errors counts
      def index_all
        process_units(load_units, incremental: false)
      end

      # Index only changed units (incremental mode). Returns stats hash.
      # @return [Hash] Stats with :processed, :skipped, :errors counts
      def index_incremental
        process_units(load_units, incremental: true)
      end

      private

      # Read every unit JSON file under the output directory, skipping the
      # checkpoint file and any file that fails to parse.
      def load_units
        pattern = File.join(@output_dir, '**', '*.json')
        Dir.glob(pattern).each_with_object([]) do |path, units|
          next if File.basename(path) == 'checkpoint.json'

          begin
            units << JSON.parse(File.read(path))
          rescue JSON::ParserError
            # Unreadable unit file — skip it, matching lenient load semantics.
          end
        end
      end

      # Drive the batch loop, checkpointing every @checkpoint_interval batches
      # and once more at the end.
      def process_units(units, incremental:)
        checkpoint = incremental ? load_checkpoint : {}
        stats = { processed: 0, skipped: 0, errors: 0 }

        units.each_slice(@batch_size).with_index(1) do |batch, batch_number|
          process_batch(batch, checkpoint, stats, incremental: incremental)
          save_checkpoint(checkpoint) if (batch_number % @checkpoint_interval).zero?
        end

        # Final checkpoint, even when the last batch fell mid-interval.
        save_checkpoint(checkpoint)

        stats
      end

      # Split a batch into skipped (unchanged, incremental only) and pending
      # items, then embed and store the pending ones.
      def process_batch(batch, checkpoint, stats, incremental:)
        pending = []

        batch.each do |unit_data|
          if incremental && checkpoint[unit_data['identifier']] == unit_data['source_hash']
            stats[:skipped] += 1
          else
            collect_embed_items(unit_data, pending)
          end
        end

        embed_and_store(pending, checkpoint, stats)
      end

      # Append one embed item per prepared text; multi-chunk units get
      # "#chunk_N"-suffixed ids so each chunk stores as a separate vector.
      def collect_embed_items(unit_data, items)
        identifier = unit_data['identifier']
        chunks = prepare_texts(unit_data)
        multi_chunk = chunks.length > 1

        chunks.each_with_index do |text, position|
          items << {
            id: multi_chunk ? "#{identifier}#chunk_#{position}" : identifier,
            text: text,
            unit_data: unit_data,
            source_hash: unit_data['source_hash'],
            identifier: identifier
          }
        end
      end

      # Chunked units yield one text per chunk; otherwise a single prepared text.
      def prepare_texts(unit_data)
        unit = build_unit(unit_data)
        return @text_preparer.prepare_chunks(unit) if unit.chunks&.any?

        [@text_preparer.prepare(unit)]
      end

      # Rehydrate an ExtractedUnit from its serialized JSON hash.
      def build_unit(data)
        ExtractedUnit.new(type: data['type']&.to_sym,
                          identifier: data['identifier'],
                          file_path: data['file_path']).tap do |unit|
          unit.namespace = data['namespace']
          unit.source_code = data['source_code']
          unit.dependencies = data['dependencies'] || []
          unit.chunks = (data['chunks'] || []).map { |c| c.transform_keys(&:to_sym) }
        end
      end

      # Embed pending items and persist the vectors. Any failure counts every
      # item in the batch as an error and re-raises as Woods::Error.
      def embed_and_store(items, checkpoint, stats)
        return if items.empty?

        vectors = @provider.embed_batch(items.map { |item| item[:text] })
        store_vectors(items, vectors, checkpoint, stats)
      rescue StandardError => e
        stats[:errors] += items.size
        raise Woods::Error, "Embedding failed: #{e.message}"
      end

      # Persist vectors with metadata, then mark each unit in the checkpoint.
      def store_vectors(items, vectors, checkpoint, stats)
        entries = items.zip(vectors).map do |item, vector|
          {
            id: item[:id],
            vector: vector,
            metadata: {
              type: item[:unit_data]['type'],
              identifier: item[:identifier],
              file_path: item[:unit_data]['file_path']
            }
          }
        end

        @vector_store.store_batch(entries)

        items.each do |item|
          checkpoint[item[:identifier]] = item[:source_hash]
          stats[:processed] += 1
        end
      end

      # Read the checkpoint file; missing or corrupt files yield an empty map.
      def load_checkpoint
        path = File.join(@output_dir, 'checkpoint.json')
        return {} unless File.exist?(path)

        begin
          JSON.parse(File.read(path))
        rescue JSON::ParserError
          {}
        end
      end

      # Persist the identifier => source_hash checkpoint map.
      def save_checkpoint(checkpoint)
        File.write(File.join(@output_dir, 'checkpoint.json'), JSON.generate(checkpoint))
      end
    end
  end
end
# frozen_string_literal: true

require 'net/http'
require 'json'

module Woods
  module Embedding
    module Provider
      # OpenAI adapter for cloud embeddings via the OpenAI HTTP API.
      #
      # Uses the `/v1/embeddings` endpoint to generate embeddings. Requires a valid
      # OpenAI API key.
      #
      # @example
      #   provider = Woods::Embedding::Provider::OpenAI.new(api_key: ENV['OPENAI_API_KEY'])
      #   vector = provider.embed("class User < ApplicationRecord; end")
      #   vectors = provider.embed_batch(["text1", "text2"])
      class OpenAI
        include Interface

        ENDPOINT = URI('https://api.openai.com/v1/embeddings')
        DEFAULT_MODEL = 'text-embedding-3-small'
        # Known vector sizes per model; unknown models fall back to probing
        # with a test embedding (see #dimensions).
        DIMENSIONS = {
          'text-embedding-3-small' => 1536,
          'text-embedding-3-large' => 3072
        }.freeze

        # @param api_key [String] OpenAI API key
        # @param model [String] OpenAI embedding model name (default: text-embedding-3-small)
        def initialize(api_key:, model: DEFAULT_MODEL)
          @api_key = api_key
          @model = model
        end

        # Embed a single text string.
        #
        # @param text [String] the text to embed
        # @return [Array<Float>] the embedding vector
        # @raise [Woods::Error] if the API returns an error
        def embed(text)
          response = post_request({ model: @model, input: text })
          response['data'].first['embedding']
        end

        # Embed multiple texts in a single request.
        #
        # Sorts results by the index field to guarantee ordering matches input.
        #
        # @param texts [Array<String>] the texts to embed
        # @return [Array<Array<Float>>] array of embedding vectors
        # @raise [Woods::Error] if the API returns an error
        def embed_batch(texts)
          response = post_request({ model: @model, input: texts })
          response['data']
            .sort_by { |item| item['index'] }
            .map { |item| item['embedding'] }
        end

        # Return the dimensionality of vectors produced by this model.
        #
        # Uses the known dimensions for standard models, falling back to a
        # test embedding for unknown models.
        #
        # @return [Integer] number of dimensions
        def dimensions
          DIMENSIONS[@model] || embed('test').length
        end

        # Return the model name.
        #
        # @return [String] the OpenAI model name
        def model_name
          @model
        end

        private

        # Send a POST request to the OpenAI embeddings API.
        #
        # Retries exactly once when the persistent connection has gone stale
        # (dropped by the server or a timeout), after discarding the dead
        # client. Re-sending the POST is acceptable here because the
        # embeddings endpoint has no persistent side effects.
        #
        # @param body [Hash] request body
        # @return [Hash] parsed JSON response
        # @raise [Woods::Error] if the API returns a non-success status
        def post_request(body)
          request = build_request(body)
          dispatch(request)
        rescue Errno::ECONNRESET, Net::OpenTimeout, IOError
          # Connection dropped — reset the client and retry once.
          @http_client = nil
          dispatch(request)
        end

        # Build the authenticated JSON POST request.
        #
        # @param body [Hash] request body
        # @return [Net::HTTP::Post]
        def build_request(body)
          request = Net::HTTP::Post.new(ENDPOINT.path)
          request['Content-Type'] = 'application/json'
          request['Authorization'] = "Bearer #{@api_key}"
          request.body = body.to_json
          request
        end

        # Execute a request and parse the response body. Shared by the first
        # attempt and the retry so the error handling cannot drift apart.
        #
        # @param request [Net::HTTP::Post]
        # @return [Hash] parsed JSON response
        # @raise [Woods::Error] on non-2xx responses
        def dispatch(request)
          response = http_client.request(request)

          unless response.is_a?(Net::HTTPSuccess)
            raise Woods::Error, "OpenAI API error: #{response.code} #{response.body}"
          end

          JSON.parse(response.body)
        end

        # Return a reusable, started HTTP client for the OpenAI API.
        # Calling http.start opens a persistent TCP connection so
        # keep_alive_timeout actually takes effect across requests.
        #
        # @return [Net::HTTP]
        def http_client
          return @http_client if @http_client&.started?

          http = Net::HTTP.new(ENDPOINT.host, ENDPOINT.port)
          http.use_ssl = true
          http.open_timeout = 10
          http.read_timeout = 30
          http.keep_alive_timeout = 30
          http.start
          @http_client = http
        end
      end
    end
  end
end