claude_memory 0.7.1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. checksums.yaml +4 -4
  2. data/.claude/memory.sqlite3 +0 -0
  3. data/.claude/rules/claude_memory.generated.md +32 -2
  4. data/.claude/settings.json +65 -15
  5. data/.claude/settings.local.json +5 -2
  6. data/.claude/skills/improve/SKILL.md +113 -25
  7. data/.claude/skills/upgrade-dependencies/SKILL.md +154 -0
  8. data/.claude-plugin/commands/distill-transcripts.md +98 -0
  9. data/.claude-plugin/commands/memory-recall.md +67 -0
  10. data/.claude-plugin/marketplace.json +2 -2
  11. data/.claude-plugin/plugin.json +3 -3
  12. data/.claude-plugin/scripts/hook-runner.sh +14 -0
  13. data/.claude-plugin/scripts/serve-mcp.sh +14 -0
  14. data/.ruby-version +1 -1
  15. data/CHANGELOG.md +90 -1
  16. data/CLAUDE.md +56 -18
  17. data/README.md +35 -0
  18. data/db/migrations/013_add_mcp_tool_calls.rb +26 -0
  19. data/db/migrations/014_canonicalize_predicates.rb +30 -0
  20. data/docs/improvements.md +74 -74
  21. data/docs/influence/claude-mem.md +1 -0
  22. data/docs/influence/claude-supermemory.md +1 -0
  23. data/docs/influence/episodic-memory.md +1 -0
  24. data/docs/influence/grepai.md +1 -0
  25. data/docs/influence/kbs.md +1 -0
  26. data/docs/influence/lossless-claw.md +1 -0
  27. data/docs/influence/qmd.md +1 -0
  28. data/docs/quality_review.md +119 -224
  29. data/hooks/hooks.json +39 -7
  30. data/lib/claude_memory/commands/checks/distill_check.rb +61 -0
  31. data/lib/claude_memory/commands/checks/hooks_check.rb +2 -2
  32. data/lib/claude_memory/commands/checks/vec_check.rb +2 -1
  33. data/lib/claude_memory/commands/completion_command.rb +149 -0
  34. data/lib/claude_memory/commands/doctor_command.rb +2 -0
  35. data/lib/claude_memory/commands/embeddings_command.rb +198 -0
  36. data/lib/claude_memory/commands/help_command.rb +12 -1
  37. data/lib/claude_memory/commands/hook_command.rb +2 -1
  38. data/lib/claude_memory/commands/index_command.rb +85 -78
  39. data/lib/claude_memory/commands/initializers/database_ensurer.rb +16 -0
  40. data/lib/claude_memory/commands/initializers/global_initializer.rb +2 -1
  41. data/lib/claude_memory/commands/initializers/hooks_configurator.rb +55 -11
  42. data/lib/claude_memory/commands/initializers/project_initializer.rb +2 -1
  43. data/lib/claude_memory/commands/install_skill_command.rb +78 -0
  44. data/lib/claude_memory/commands/registry.rb +47 -32
  45. data/lib/claude_memory/commands/reject_command.rb +62 -0
  46. data/lib/claude_memory/commands/restore_command.rb +77 -0
  47. data/lib/claude_memory/commands/skills/distill-transcripts.md +102 -0
  48. data/lib/claude_memory/commands/skills/memory-recall.md +67 -0
  49. data/lib/claude_memory/commands/stats_command.rb +98 -2
  50. data/lib/claude_memory/configuration.rb +14 -1
  51. data/lib/claude_memory/core/fact_ranker.rb +2 -2
  52. data/lib/claude_memory/core/rr_fusion.rb +23 -6
  53. data/lib/claude_memory/core/snippet_extractor.rb +7 -3
  54. data/lib/claude_memory/core/text_builder.rb +11 -0
  55. data/lib/claude_memory/distill/json_schema.md +8 -4
  56. data/lib/claude_memory/distill/null_distiller.rb +2 -0
  57. data/lib/claude_memory/domain/entity.rb +13 -1
  58. data/lib/claude_memory/domain/fact.rb +26 -2
  59. data/lib/claude_memory/domain/provenance.rb +0 -1
  60. data/lib/claude_memory/embeddings/api_adapter.rb +97 -0
  61. data/lib/claude_memory/embeddings/dimension_check.rb +23 -0
  62. data/lib/claude_memory/embeddings/fastembed_adapter.rb +46 -12
  63. data/lib/claude_memory/embeddings/generator.rb +4 -0
  64. data/lib/claude_memory/embeddings/inspector.rb +91 -0
  65. data/lib/claude_memory/embeddings/model_registry.rb +210 -0
  66. data/lib/claude_memory/embeddings/resolver.rb +44 -0
  67. data/lib/claude_memory/hook/context_injector.rb +58 -2
  68. data/lib/claude_memory/hook/distillation_runner.rb +46 -0
  69. data/lib/claude_memory/hook/handler.rb +11 -2
  70. data/lib/claude_memory/index/vector_index.rb +15 -2
  71. data/lib/claude_memory/infrastructure/schema_validator.rb +3 -3
  72. data/lib/claude_memory/ingest/ingester.rb +17 -0
  73. data/lib/claude_memory/mcp/handlers/context_handlers.rb +38 -0
  74. data/lib/claude_memory/mcp/handlers/management_handlers.rb +169 -0
  75. data/lib/claude_memory/mcp/handlers/query_handlers.rb +115 -0
  76. data/lib/claude_memory/mcp/handlers/setup_handlers.rb +211 -0
  77. data/lib/claude_memory/mcp/handlers/shortcut_handlers.rb +37 -0
  78. data/lib/claude_memory/mcp/handlers/stats_handlers.rb +205 -0
  79. data/lib/claude_memory/mcp/instructions_builder.rb +19 -1
  80. data/lib/claude_memory/mcp/query_guide.rb +10 -0
  81. data/lib/claude_memory/mcp/response_formatter.rb +1 -0
  82. data/lib/claude_memory/mcp/server.rb +22 -1
  83. data/lib/claude_memory/mcp/telemetry.rb +86 -0
  84. data/lib/claude_memory/mcp/text_summary.rb +26 -0
  85. data/lib/claude_memory/mcp/tool_definitions.rb +116 -4
  86. data/lib/claude_memory/mcp/tool_helpers.rb +43 -0
  87. data/lib/claude_memory/mcp/tools.rb +50 -679
  88. data/lib/claude_memory/publish.rb +40 -5
  89. data/lib/claude_memory/recall/dual_engine.rb +105 -0
  90. data/lib/claude_memory/recall/legacy_engine.rb +138 -0
  91. data/lib/claude_memory/recall/query_core.rb +371 -0
  92. data/lib/claude_memory/recall.rb +121 -673
  93. data/lib/claude_memory/resolve/predicate_policy.rb +63 -3
  94. data/lib/claude_memory/resolve/resolver.rb +43 -0
  95. data/lib/claude_memory/shortcuts.rb +4 -4
  96. data/lib/claude_memory/store/retry_handler.rb +61 -0
  97. data/lib/claude_memory/store/schema_manager.rb +68 -0
  98. data/lib/claude_memory/store/sqlite_store.rb +334 -201
  99. data/lib/claude_memory/store/store_manager.rb +50 -1
  100. data/lib/claude_memory/sweep/maintenance.rb +115 -1
  101. data/lib/claude_memory/sweep/sweeper.rb +3 -0
  102. data/lib/claude_memory/templates/hooks.example.json +26 -7
  103. data/lib/claude_memory/version.rb +1 -1
  104. data/lib/claude_memory.rb +16 -0
  105. metadata +48 -8
  106. data/.claude/memory.sqlite3-shm +0 -0
  107. data/.claude/memory.sqlite3-wal +0 -0
@@ -52,7 +52,8 @@ module ClaudeMemory
52
52
  stats = store.vector_index.coverage_stats
53
53
  totals[:with_embedding] += stats[:with_embedding]
54
54
  totals[:vec_indexed] += stats[:vec_indexed]
55
- rescue => _e
55
+ rescue => e
56
+ ClaudeMemory.logger.debug("VecCheck failed for #{db_path}: #{e.message}")
56
57
  next
57
58
  ensure
58
59
  store&.close
@@ -0,0 +1,149 @@
1
+ # frozen_string_literal: true
2
+
3
module ClaudeMemory
  module Commands
    # Generates shell completion scripts for bash and zsh.
    # Outputs completion script to stdout for eval or redirection.
    class CompletionCommand < BaseCommand
      # Prints the completion script for the requested shell.
      #
      # The shell defaults to the basename of $SHELL; `--shell` only accepts
      # "bash" or "zsh", but an auto-detected value may be anything (e.g. "fish"),
      # which is why the `else` branch below is reachable.
      #
      # @param args [Array<String>] command-line arguments (`--shell SHELL`)
      # @return [Integer] 0 on success, 1 when option parsing returns nil;
      #   for an unsupported shell, whatever `failure` returns
      def call(args)
        opts = parse_options(args, {shell: detect_shell}) do |o|
          OptionParser.new do |parser|
            parser.banner = "Usage: claude-memory completion [options]"
            parser.on("--shell SHELL", %w[bash zsh], "Shell type: bash or zsh (auto-detected)") { |v| o[:shell] = v }
          end
        end
        return 1 if opts.nil?

        case opts[:shell]
        when "zsh"
          stdout.puts zsh_completion
        when "bash"
          stdout.puts bash_completion
        else
          return failure("Unknown shell: #{opts[:shell]}. Use --shell bash or --shell zsh")
        end
        0
      end

      private

      # Basename of the user's login shell, falling back to bash when $SHELL is unset.
      def detect_shell
        shell = ENV.fetch("SHELL", "/bin/bash")
        File.basename(shell)
      end

      # Sorted list of every registered command name.
      def command_names
        Registry.all_commands.sort
      end

      # Escapes single quotes so +text+ can be embedded inside a single-quoted
      # zsh word. Text without quotes is returned unchanged.
      def zsh_single_quoted(text)
        text.to_s.gsub("'") { "'\\''" }
      end

      # Builds the zsh completion script (a `#compdef` function).
      def zsh_completion
        commands_with_desc = Registry.descriptions.sort.map { |name, desc|
          "    '#{zsh_single_quoted(name)}:#{zsh_single_quoted(desc)}'"
        }.join("\n")

        # The `index` flags below mirror the options the index command's parser
        # defines (--scope, --batch-size, --force, --vec, --provider).
        <<~ZSH
          #compdef claude-memory

          _claude_memory() {
            local -a commands
            commands=(
          #{commands_with_desc}
            )

            _arguments -C \\
              '1:command:->command' \\
              '*::arg:->args'

            case $state in
              command)
                _describe 'command' commands
                ;;
              args)
                case $words[1] in
                  recall|search|explain)
                    _arguments '*:query:'
                    ;;
                  promote)
                    _arguments '*:fact_id:'
                    ;;
                  hook)
                    local -a subcommands
                    subcommands=('ingest:Ingest transcript' 'sweep:Run maintenance' 'publish:Publish snapshot' 'context:Inject context')
                    _describe 'subcommand' subcommands
                    ;;
                  compact|export|changes|stats|sweep|conflicts)
                    _arguments '--scope[Scope]:scope:(all global project)'
                    ;;
                  index)
                    _arguments '--scope[Scope]:scope:(all global project)' '--batch-size[Batch size]:size:' '--force[Re-index facts that already have embeddings]' '--vec[Backfill vec0 index from existing embeddings]' '--provider[Embedding provider]:provider:(tfidf fastembed api)'
                    ;;
                  completion)
                    _arguments '--shell[Shell type]:shell:(bash zsh)'
                    ;;
                  install-skill)
                    local -a skills
                    skills=(#{skill_names.map { |s| "'#{s}'" }.join(" ")})
                    _arguments '--list[List available skills]' '--force[Overwrite existing]' '1:skill:($skills)'
                    ;;
                esac
                ;;
            esac
          }

          _claude_memory "$@"
        ZSH
      end

      # Builds the bash completion script (a `complete -F` function).
      def bash_completion
        <<~BASH
          # bash completion for claude-memory

          _claude_memory() {
            local cur prev commands
            COMPREPLY=()
            cur="${COMP_WORDS[COMP_CWORD]}"
            prev="${COMP_WORDS[COMP_CWORD-1]}"
            commands="#{command_names.join(" ")}"

            if [[ ${COMP_CWORD} -eq 1 ]]; then
              COMPREPLY=( $(compgen -W "${commands}" -- "${cur}") )
              return 0
            fi

            case "${COMP_WORDS[1]}" in
              hook)
                COMPREPLY=( $(compgen -W "ingest sweep publish context" -- "${cur}") )
                ;;
              compact|export|changes|stats|sweep|conflicts)
                if [[ "${prev}" == "--scope" ]]; then
                  COMPREPLY=( $(compgen -W "all global project" -- "${cur}") )
                else
                  COMPREPLY=( $(compgen -W "--scope" -- "${cur}") )
                fi
                ;;
              index)
                case "${prev}" in
                  --scope)
                    COMPREPLY=( $(compgen -W "all global project" -- "${cur}") )
                    ;;
                  --provider)
                    COMPREPLY=( $(compgen -W "tfidf fastembed api" -- "${cur}") )
                    ;;
                  --batch-size)
                    ;;
                  *)
                    COMPREPLY=( $(compgen -W "--scope --batch-size --force --vec --provider" -- "${cur}") )
                    ;;
                esac
                ;;
              install-skill)
                if [[ "${prev}" == "install-skill" ]]; then
                  COMPREPLY=( $(compgen -W "#{skill_names.join(" ")} --list --force" -- "${cur}") )
                fi
                ;;
              completion)
                if [[ "${prev}" == "--shell" ]]; then
                  COMPREPLY=( $(compgen -W "bash zsh" -- "${cur}") )
                else
                  COMPREPLY=( $(compgen -W "--shell" -- "${cur}") )
                fi
                ;;
            esac
            return 0
          }

          complete -F _claude_memory claude-memory
        BASH
      end

      # Names of the skills that `install-skill` can install.
      def skill_names
        InstallSkillCommand::AVAILABLE_SKILLS.keys
      end
    end
  end
end
@@ -20,6 +20,8 @@ module ClaudeMemory
20
20
  checks = [
21
21
  Checks::DatabaseCheck.new(manager.global_db_path, "global"),
22
22
  Checks::DatabaseCheck.new(manager.project_db_path, "project"),
23
+ Checks::DistillCheck.new(manager.global_db_path, "global"),
24
+ Checks::DistillCheck.new(manager.project_db_path, "project"),
23
25
  Checks::VecCheck.new,
24
26
  Checks::SnapshotCheck.new,
25
27
  Checks::ClaudeMdCheck.new,
@@ -0,0 +1,198 @@
1
+ # frozen_string_literal: true
2
+
3
module ClaudeMemory
  module Commands
    # Shows embedding configuration, lists available models, and validates setup.
    #
    # Subcommands:
    #   claude-memory embeddings        # Show current config
    #   claude-memory embeddings list   # List available models
    #   claude-memory embeddings check  # Validate current setup
    #
    class EmbeddingsCommand < BaseCommand
      # Dispatches to the requested subcommand.
      #
      # @param args [Array<String>] command-line arguments; the first element
      #   selects the subcommand (none means "show config")
      # @return [Integer] 1 when option parsing returns nil, otherwise the
      #   subcommand's result (0/1) or whatever `failure` returns
      def call(args)
        opts = parse_options(args, {}) do |_o|
          OptionParser.new do |parser|
            parser.banner = "Usage: claude-memory embeddings [list|check]"
          end
        end
        return 1 if opts.nil?

        subcommand = args.first

        case subcommand
        when "list" then list_models
        when "check" then check_setup
        when nil then show_config
        else
          failure("Unknown subcommand: #{subcommand}. Use: list, check")
        end
      end

      private

      # Memoized inspector used to read per-database embedding state.
      def inspector
        @inspector ||= Embeddings::Inspector.new
      end

      # Provider selected through the environment, defaulting to "tfidf" when unset.
      def configured_provider
        ENV["CLAUDE_MEMORY_EMBEDDING_PROVIDER"] || "tfidf"
      end

      # Prints the provider/model selected through ENV, what each database
      # records, and the list of supported ENV variables. Always returns 0.
      def show_config
        provider = configured_provider
        model = ENV["CLAUDE_MEMORY_EMBEDDING_MODEL"]
        api_url = ENV["CLAUDE_MEMORY_EMBEDDING_API_URL"]

        stdout.puts "Embedding Configuration"
        stdout.puts "======================"
        stdout.puts "Provider: #{provider}"
        stdout.puts "Model: #{model || "(default)"}"

        if model
          info = Embeddings::ModelRegistry.find(model)
          if info
            stdout.puts "Dimensions: #{info.dimensions}"
            stdout.puts "Description: #{info.description}"
          else
            stdout.puts "Dimensions: (unknown - will be discovered at runtime)"
          end
        else
          info = Embeddings::ModelRegistry.default_for_provider(provider)
          if info
            stdout.puts "Default model: #{info.name}"
            stdout.puts "Dimensions: #{info.dimensions}"
          end
        end

        stdout.puts "API URL: #{api_url}" if api_url && provider == "api"

        inspector.database_states.each do |state|
          stdout.puts ""
          stdout.puts "#{state.label.capitalize} DB: provider=#{state.provider || "unknown"}, dimensions=#{state.dimensions || "unknown"}"
        end

        stdout.puts ""
        stdout.puts "ENV variables:"
        stdout.puts " CLAUDE_MEMORY_EMBEDDING_PROVIDER Provider (tfidf, fastembed, api)"
        stdout.puts " CLAUDE_MEMORY_EMBEDDING_MODEL Model name"
        stdout.puts " CLAUDE_MEMORY_EMBEDDING_API_KEY API key (for api provider)"
        stdout.puts " CLAUDE_MEMORY_EMBEDDING_API_URL API endpoint (for api provider)"
        0
      end

      # Prints every model in the registry, grouped by provider. Always returns 0.
      def list_models
        Embeddings::ModelRegistry.providers.each do |provider|
          stdout.puts ""
          stdout.puts "#{provider_label(provider)}:"
          stdout.puts "-" * 40

          Embeddings::ModelRegistry.models_for_provider(provider).each do |model|
            size = model.size_mb ? "#{model.size_mb}MB" : "cloud"
            tokens = model.max_tokens ? "#{model.max_tokens} tokens" : ""
            stdout.puts " #{model.name}"
            stdout.puts " #{model.dimensions}-dim | #{size} | #{tokens}"
            stdout.puts " #{model.description}"
          end
        end

        stdout.puts ""
        stdout.puts "Custom models: Set CLAUDE_MEMORY_EMBEDDING_MODEL to any model"
        stdout.puts "supported by your provider. Dimensions are auto-detected."
        0
      end

      # Runs the provider, model and dimension checks and prints a summary.
      #
      # @return [Integer] 0 when every check passed, 1 otherwise
      def check_setup
        provider_name = configured_provider
        model_name = ENV["CLAUDE_MEMORY_EMBEDDING_MODEL"]

        stdout.puts "Checking embedding setup..."
        stdout.puts ""

        # `&=` (not `&&=`) so every check still runs and reports after a failure.
        ok = true
        ok &= check_provider(provider_name)
        ok &= check_model(provider_name, model_name) if model_name
        ok &= render_dimension_checks(provider_name, model_name)

        stdout.puts ""
        stdout.puts ok ? "All checks passed." : "Some checks failed. See above."
        ok ? 0 : 1
      end

      # Validates that the named provider is usable.
      #
      # @return [Boolean] false for an unknown or unusable provider
      def check_provider(name)
        case name
        when "fastembed"
          check_fastembed
        when "api"
          check_api_config
        when "tfidf"
          stdout.puts " [OK] tfidf provider (built-in, always available)"
          true
        else
          stdout.puts " [FAIL] Unknown provider: #{name}"
          false
        end
      end

      # Reports whether the model is known to the registry and belongs to the
      # selected provider. Only informs/warns: it always returns true.
      def check_model(provider_name, model_name)
        info = Embeddings::ModelRegistry.find(model_name)
        if info
          if info.provider != provider_name
            stdout.puts " [WARN] Model '#{model_name}' is for '#{info.provider}' provider, but '#{provider_name}' is selected"
            stdout.puts " Set CLAUDE_MEMORY_EMBEDDING_PROVIDER=#{info.provider}"
          else
            stdout.puts " [OK] Model '#{model_name}' (#{info.dimensions}-dim)"
          end
        else
          stdout.puts " [INFO] Model '#{model_name}' not in registry (dimensions will be auto-detected)"
        end
        true
      end

      # Prints one line per dimension check reported by the inspector.
      #
      # @return [Boolean] false when any check reports a :mismatch
      def render_dimension_checks(provider_name, model_name)
        ok = true

        inspector.dimension_checks(provider_name, model_name).each do |check|
          case check.status
          when :mismatch
            stdout.puts " [WARN] #{check.label}: Dimension mismatch (stored: #{check.stored_dims}, current: #{check.current_dims})"
            stdout.puts " Re-index with: claude-memory index --force --scope #{check.label}"
            ok = false
          when :match
            stdout.puts " [OK] #{check.label}: #{check.stored_dims}-dim (provider: #{check.stored_provider || "unknown"})"
          when :fresh
            stdout.puts " [INFO] #{check.label}: No embeddings indexed yet"
          end
        end

        ok
      end

      # Checks that the optional fastembed gem can be loaded.
      # The require is deliberately lazy: the gem is only needed for this provider.
      def check_fastembed
        require "fastembed"
        stdout.puts " [OK] fastembed gem available"
        true
      rescue LoadError
        stdout.puts " [FAIL] fastembed gem not installed"
        stdout.puts " Add `gem 'fastembed'` to your Gemfile"
        false
      end

      # Checks that an API key is present in the environment.
      # A variable that is set but blank counts as missing, so an empty
      # CLAUDE_MEMORY_EMBEDDING_API_KEY no longer hides a valid OPENAI_API_KEY.
      def check_api_config
        key = [ENV["CLAUDE_MEMORY_EMBEDDING_API_KEY"], ENV["OPENAI_API_KEY"]]
          .find { |value| value && !value.strip.empty? }
        if key
          stdout.puts " [OK] API key configured"
          true
        else
          stdout.puts " [FAIL] No API key found"
          stdout.puts " Set CLAUDE_MEMORY_EMBEDDING_API_KEY or OPENAI_API_KEY"
          false
        end
      end

      # Human-readable heading for a provider; providers without a dedicated
      # label fall back to their own name instead of an empty heading.
      def provider_label(provider)
        case provider
        when "fastembed" then "fastembed (local ONNX, no API key)"
        when "api" then "api (OpenAI-compatible endpoints, requires API key)"
        when "tfidf" then "tfidf (built-in, no dependencies)"
        else provider.to_s
        end
      end
    end
  end
end
@@ -19,18 +19,29 @@ module ClaudeMemory
19
19
  explain Explain a fact with receipts
20
20
  export Export facts to JSON for backup
21
21
  help Show this help message
22
- hook Run hook entrypoints (ingest|sweep|publish)
22
+ hook Run hook entrypoints (ingest|sweep|publish|context)
23
23
  init Initialize ClaudeMemory in a project
24
24
  ingest Ingest transcript delta
25
25
  promote Promote a project fact to global memory
26
26
  publish Publish snapshot to Claude Code memory
27
27
  recall Recall facts matching a query
28
+ recover Recover stuck operations
29
+ reject Mark a fact as rejected (e.g. hallucination)
30
+ restore Restore superseded facts from reclassified predicates
28
31
  search Search indexed content
29
32
  serve-mcp Start MCP server
33
+ stats Show statistics (--tools for MCP telemetry)
30
34
  sweep Run maintenance/pruning
31
35
  uninstall Remove ClaudeMemory configuration
32
36
  version Show version number
33
37
 
38
+ Utilities:
39
+ completion Generate shell completions (bash/zsh)
40
+ embeddings Inspect embedding backend
41
+ git-lfs Git LFS integration for memory DB
42
+ index Build or rebuild content indexes
43
+ install-skill Install agent skills to ~/.claude/commands/
44
+
34
45
  Run 'claude-memory <command> --help' for more information on a command.
35
46
  HELP
36
47
  0
@@ -171,13 +171,14 @@ module ClaudeMemory
171
171
 
172
172
  def hook_context(payload, db_path)
173
173
  project_path = payload["project_path"] || payload["cwd"]
174
+ source = payload["source"]
174
175
  manager = ClaudeMemory::Store::StoreManager.new(
175
176
  project_db_path: db_path,
176
177
  project_path: project_path
177
178
  )
178
179
  manager.ensure_both!
179
180
 
180
- injector = ClaudeMemory::Hook::ContextInjector.new(manager)
181
+ injector = ClaudeMemory::Hook::ContextInjector.new(manager, source: source)
181
182
  context_text = injector.generate_context
182
183
 
183
184
  if context_text
@@ -9,13 +9,14 @@ module ClaudeMemory
9
9
  SCOPE_PROJECT = "project"
10
10
 
11
11
  def call(args)
12
- opts = parse_options(args, {scope: SCOPE_ALL, batch_size: 100, force: false, vec: false}) do |o|
12
+ opts = parse_options(args, {scope: SCOPE_ALL, batch_size: 100, force: false, vec: false, provider: nil}) do |o|
13
13
  OptionParser.new do |parser|
14
14
  parser.banner = "Usage: claude-memory index [options]"
15
15
  parser.on("--scope SCOPE", "Scope: global, project, or all (default: all)") { |v| o[:scope] = v }
16
16
  parser.on("--batch-size SIZE", Integer, "Batch size (default: 100)") { |v| o[:batch_size] = v }
17
17
  parser.on("--force", "Re-index facts that already have embeddings") { o[:force] = true }
18
18
  parser.on("--vec", "Backfill vec0 index from existing embeddings (no regeneration)") { o[:vec] = true }
19
+ parser.on("--provider NAME", "Embedding provider: tfidf, fastembed, api") { |v| o[:provider] = v }
19
20
  end
20
21
  end
21
22
  return 1 if opts.nil?
@@ -30,7 +31,7 @@ module ClaudeMemory
30
31
  return vec_backfill(opts)
31
32
  end
32
33
 
33
- generator = Embeddings::Generator.new
34
+ generator = Embeddings.resolve(opts[:provider])
34
35
 
35
36
  scopes_for(opts[:scope]).each do |label, db_path|
36
37
  index_database(label, db_path, generator, opts)
@@ -42,9 +43,10 @@ module ClaudeMemory
42
43
  private
43
44
 
44
45
  def scopes_for(scope)
46
+ config = Configuration.new
45
47
  pairs = []
46
- pairs << ["global", Configuration.global_db_path] if scope == SCOPE_ALL || scope == SCOPE_GLOBAL
47
- pairs << ["project", Configuration.project_db_path] if scope == SCOPE_ALL || scope == SCOPE_PROJECT
48
+ pairs << ["global", config.global_db_path] if scope == SCOPE_ALL || scope == SCOPE_GLOBAL
49
+ pairs << ["project", config.project_db_path] if scope == SCOPE_ALL || scope == SCOPE_PROJECT
48
50
  pairs
49
51
  end
50
52
 
@@ -55,111 +57,85 @@ module ClaudeMemory
55
57
  end
56
58
 
57
59
  store = Store::SQLiteStore.new(db_path)
60
+ handle_dimension_mismatch(store, generator, label)
58
61
  tracker = Infrastructure::OperationTracker.new(store)
59
62
 
60
- # Check for existing progress (resumption support)
63
+ facts, checkpoint = find_facts_to_index(store, tracker, label, opts)
64
+ unless facts
65
+ store.close
66
+ return
67
+ end
68
+
69
+ operation_id = checkpoint ? checkpoint[:operation_id] : tracker.start_operation(
70
+ operation_type: "index_embeddings",
71
+ scope: label,
72
+ total_items: facts.size,
73
+ checkpoint_data: {last_fact_id: nil}
74
+ )
75
+
76
+ stdout.puts "#{label.capitalize} database: Indexing #{facts.size} facts..."
77
+ run_indexing(store, facts, generator, tracker, operation_id, checkpoint, opts)
78
+ end
79
+
80
# Compares the dimensions recorded for +store+ against +generator+ and, when
# the check reports a :mismatch, announces it and clears the stored embeddings.
# Does nothing for any other status.
def handle_dimension_mismatch(store, generator, label)
  dimension_check = Embeddings::DimensionCheck.call(store, generator)

  if dimension_check.status == :mismatch
    stdout.puts "#{label.capitalize}: Embedding dimensions changed (#{dimension_check.stored} → #{dimension_check.current}), clearing stale embeddings..."
    clear_stale_embeddings(store)
  end
end
87
+
88
+ def find_facts_to_index(store, tracker, label, opts)
61
89
  checkpoint = tracker.get_checkpoint(operation_type: "index_embeddings", scope: label)
90
+
62
91
  if checkpoint && !opts[:force]
63
92
  stdout.puts "#{label.capitalize} database: Resuming from previous run (processed #{checkpoint[:processed_items]} facts)..."
64
93
  resume_from_fact_id = checkpoint[:checkpoint_data][:last_fact_id]
65
- else
66
- resume_from_fact_id = nil
67
- end
68
-
69
- # Find facts to index
70
- facts_dataset = if opts[:force]
71
- store.facts
72
- else
73
- store.facts.where(embedding_json: nil)
74
- end
75
-
76
- # If resuming, skip facts we've already processed
77
- if resume_from_fact_id
78
- facts_dataset = facts_dataset.where(Sequel.lit("id > ?", resume_from_fact_id))
79
94
  end
80
95
 
96
+ facts_dataset = opts[:force] ? store.facts : store.facts.where(embedding_json: nil)
97
+ facts_dataset = facts_dataset.where(Sequel.lit("id > ?", resume_from_fact_id)) if resume_from_fact_id
81
98
  facts = facts_dataset.order(:id).all
82
99
 
83
100
  if facts.empty? && !checkpoint
84
101
  stdout.puts "#{label.capitalize} database: All facts already indexed"
85
- store.close
86
- return
102
+ return nil
87
103
  elsif facts.empty? && checkpoint
88
- # Resume found nothing left to do - mark as completed
89
104
  tracker.complete_operation(checkpoint[:operation_id])
90
105
  stdout.puts "#{label.capitalize} database: Resumed operation completed (nothing left to index)"
91
- store.close
92
- return
106
+ return nil
93
107
  end
94
108
 
95
- # Start or continue operation tracking
96
- operation_id = checkpoint ? checkpoint[:operation_id] : tracker.start_operation(
97
- operation_type: "index_embeddings",
98
- scope: label,
99
- total_items: facts.size,
100
- checkpoint_data: {last_fact_id: nil}
101
- )
102
-
103
- stdout.puts "#{label.capitalize} database: Indexing #{facts.size} facts..."
109
+ [facts, checkpoint]
110
+ end
104
111
 
112
+ def run_indexing(store, facts, generator, tracker, operation_id, checkpoint, opts)
105
113
  vec_index = store.vector_index
106
- if vec_index.available?
107
- stdout.puts " sqlite-vec available, dual-writing to vec0 index"
108
- end
114
+ stdout.puts " sqlite-vec available, dual-writing to vec0 index" if vec_index.available?
109
115
 
110
- # Build embedding cache from already-embedded facts for content-addressed dedup
111
116
  embedding_cache = build_embedding_cache(store)
112
117
  cache_hits = 0
113
-
114
118
  processed = checkpoint ? checkpoint[:processed_items] : 0
119
+
115
120
  begin
116
121
  facts.each_slice(opts[:batch_size]) do |batch|
117
- # Wrap batch processing in transaction for atomicity
118
- store.db.transaction do
119
- batch.each do |fact|
120
- # Generate text representation
121
- text = build_fact_text(fact, store)
122
-
123
- # Content-addressed dedup: reuse embedding if identical text exists
124
- embedding = embedding_cache[text]
125
- if embedding
126
- cache_hits += 1
127
- else
128
- embedding = generator.generate(text)
129
- embedding_cache[text] = embedding
130
- end
131
-
132
- # Store embedding (JSON column)
133
- store.update_fact_embedding(fact[:id], embedding)
134
-
135
- # Dual-write to vec0 if available (insert_embedding manages vec_indexed_at)
136
- vec_index.insert_embedding(fact[:id], embedding) if vec_index.available?
137
-
138
- processed += 1
139
- end
140
-
141
- # Update checkpoint after batch commits
142
- last_fact_id = batch.last[:id]
143
- tracker.update_progress(
144
- operation_id,
145
- processed_items: processed,
146
- checkpoint_data: {last_fact_id: last_fact_id}
147
- )
148
- end
149
-
122
+ cache_hits += process_batch(store, batch, generator, vec_index, embedding_cache)
123
+ processed += batch.size
124
+
125
+ tracker.update_progress(
126
+ operation_id,
127
+ processed_items: processed,
128
+ checkpoint_data: {last_fact_id: batch.last[:id]}
129
+ )
150
130
  stdout.puts " Processed #{processed} facts..."
151
131
  end
152
132
 
153
- if processed > 0
154
- pct = (cache_hits > 0) ? "#{(cache_hits * 100.0 / processed).round(1)}%" : "0%"
155
- stdout.puts " Cache hits: #{cache_hits}/#{processed} (#{pct} dedup)"
156
- end
157
-
158
- # Mark operation as completed
133
+ report_dedup_stats(processed, cache_hits)
134
+ store.set_meta("embedding_dimensions", generator.dimensions.to_s)
135
+ store.set_meta("embedding_provider", generator.name)
159
136
  tracker.complete_operation(operation_id)
160
137
  stdout.puts " Done!"
161
138
  rescue => e
162
- # Mark operation as failed
163
139
  tracker.fail_operation(operation_id, e.message)
164
140
  stderr.puts " Failed: #{e.message}"
165
141
  raise
@@ -168,6 +144,32 @@ module ClaudeMemory
168
144
  end
169
145
  end
170
146
 
147
# Embeds every fact in +batch+ inside a single database transaction.
#
# Facts whose text is already a key in +embedding_cache+ reuse the cached
# embedding; otherwise a new one is generated and added to the cache. Each
# embedding is stored on the fact and, when the vector index is available,
# also inserted into it.
#
# @param store [#db, #update_fact_embedding] store owning the facts
# @param batch [Array<Hash>] fact rows (each read via fact[:id])
# @param generator [#generate] embedding generator
# @param vec_index [#available?, #insert_embedding] vector index
# @param embedding_cache [Hash] text => embedding; mutated in place
# @return [Integer] number of facts served from the cache
def process_batch(store, batch, generator, vec_index, embedding_cache)
  # Loop-invariant: ask the index once per batch rather than once per fact.
  dual_write = vec_index.available?
  cache_hits = 0

  store.db.transaction do
    batch.each do |fact|
      text = build_fact_text(fact, store)
      embedding = embedding_cache[text]
      if embedding
        cache_hits += 1
      else
        embedding = generator.generate(text)
        embedding_cache[text] = embedding
      end

      store.update_fact_embedding(fact[:id], embedding)
      vec_index.insert_embedding(fact[:id], embedding) if dual_write
    end
  end

  cache_hits
end
165
+
166
# Prints how many of the processed facts reused a cached embedding.
# Prints nothing when no facts were processed; a zero hit count is shown
# as "0%" rather than "0.0%".
def report_dedup_stats(processed, cache_hits)
  if processed.positive?
    ratio = cache_hits.positive? ? "#{(cache_hits * 100.0 / processed).round(1)}%" : "0%"
    stdout.puts " Cache hits: #{cache_hits}/#{processed} (#{ratio} dedup)"
  end
end
+
171
173
  def vec_backfill(opts)
172
174
  scopes_for(opts[:scope]).each do |label, db_path|
173
175
  unless File.exist?(db_path)
@@ -244,6 +246,11 @@ module ClaudeMemory
244
246
  parts.join(" ")
245
247
  end
246
248
 
249
# Resets embedding_json and vec_indexed_at to nil on every fact that has an
# embedding, then clears the store's vector index.
def clear_stale_embeddings(store)
  embedded_facts = store.facts.where(Sequel.~(embedding_json: nil))
  embedded_facts.update(embedding_json: nil, vec_indexed_at: nil)

  vector_index = store.vector_index
  vector_index.clear!
end
253
+
247
254
  def valid_scope?(scope)
248
255
  [SCOPE_ALL, SCOPE_GLOBAL, SCOPE_PROJECT].include?(scope)
249
256
  end
@@ -15,6 +15,10 @@ module ClaudeMemory
15
15
  @stdout.puts "✓ Global database: #{manager.global_db_path}"
16
16
  manager.ensure_project!
17
17
  @stdout.puts "✓ Project database: #{manager.project_db_path}"
18
+
19
+ backfill_distillation_metrics(manager.global_store, "global")
20
+ backfill_distillation_metrics(manager.project_store, "project")
21
+
18
22
  manager.close
19
23
  end
20
24
 
@@ -22,8 +26,20 @@ module ClaudeMemory
22
26
  manager = ClaudeMemory::Store::StoreManager.new
23
27
  manager.ensure_global!
24
28
  @stdout.puts "✓ Created global database: #{manager.global_db_path}"
29
+
30
+ backfill_distillation_metrics(manager.global_store, "global")
31
+
25
32
  manager.close
26
33
  end
34
+
35
+ private
36
+
37
# Runs the store's distillation-metrics backfill and, when it reports a
# positive count, prints a confirmation line tagged with +label+.
def backfill_distillation_metrics(store, label)
  count = store.backfill_distillation_metrics!
  @stdout.puts "✓ Marked #{count} pre-existing content items as distilled (#{label})" if count.positive?
end
27
43
  end
28
44
  end
29
45
  end