claude_memory 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. checksums.yaml +4 -4
  2. data/.claude/.mind.mv2.o2N83S +0 -0
  3. data/.claude/CLAUDE.md +1 -0
  4. data/.claude/rules/claude_memory.generated.md +28 -9
  5. data/.claude/settings.local.json +9 -1
  6. data/.claude/skills/check-memory/SKILL.md +77 -0
  7. data/.claude/skills/improve/SKILL.md +532 -0
  8. data/.claude/skills/improve/feature-patterns.md +1221 -0
  9. data/.claude/skills/quality-update/SKILL.md +229 -0
  10. data/.claude/skills/quality-update/implementation-guide.md +346 -0
  11. data/.claude/skills/review-commit/SKILL.md +199 -0
  12. data/.claude/skills/review-for-quality/SKILL.md +154 -0
  13. data/.claude/skills/review-for-quality/expert-checklists.md +79 -0
  14. data/.claude/skills/setup-memory/SKILL.md +168 -0
  15. data/.claude/skills/study-repo/SKILL.md +307 -0
  16. data/.claude/skills/study-repo/analysis-template.md +323 -0
  17. data/.claude/skills/study-repo/focus-examples.md +327 -0
  18. data/CHANGELOG.md +133 -0
  19. data/CLAUDE.md +130 -11
  20. data/README.md +117 -10
  21. data/db/migrations/001_create_initial_schema.rb +117 -0
  22. data/db/migrations/002_add_project_scoping.rb +33 -0
  23. data/db/migrations/003_add_session_metadata.rb +42 -0
  24. data/db/migrations/004_add_fact_embeddings.rb +20 -0
  25. data/db/migrations/005_add_incremental_sync.rb +21 -0
  26. data/db/migrations/006_add_operation_tracking.rb +40 -0
  27. data/db/migrations/007_add_ingestion_metrics.rb +26 -0
  28. data/docs/.claude/mind.mv2.lock +0 -0
  29. data/docs/GETTING_STARTED.md +587 -0
  30. data/docs/RELEASE_NOTES_v0.2.0.md +0 -1
  31. data/docs/RUBY_COMMUNITY_POST_v0.2.0.md +0 -2
  32. data/docs/architecture.md +9 -8
  33. data/docs/auto_init_design.md +230 -0
  34. data/docs/improvements.md +557 -731
  35. data/docs/influence/.gitkeep +13 -0
  36. data/docs/influence/grepai.md +933 -0
  37. data/docs/influence/qmd.md +2195 -0
  38. data/docs/plugin.md +257 -11
  39. data/docs/quality_review.md +472 -1273
  40. data/docs/remaining_improvements.md +330 -0
  41. data/lefthook.yml +13 -0
  42. data/lib/claude_memory/commands/checks/claude_md_check.rb +41 -0
  43. data/lib/claude_memory/commands/checks/database_check.rb +120 -0
  44. data/lib/claude_memory/commands/checks/hooks_check.rb +112 -0
  45. data/lib/claude_memory/commands/checks/reporter.rb +110 -0
  46. data/lib/claude_memory/commands/checks/snapshot_check.rb +30 -0
  47. data/lib/claude_memory/commands/doctor_command.rb +12 -129
  48. data/lib/claude_memory/commands/help_command.rb +1 -0
  49. data/lib/claude_memory/commands/hook_command.rb +9 -2
  50. data/lib/claude_memory/commands/index_command.rb +169 -0
  51. data/lib/claude_memory/commands/ingest_command.rb +1 -1
  52. data/lib/claude_memory/commands/init_command.rb +5 -197
  53. data/lib/claude_memory/commands/initializers/database_ensurer.rb +30 -0
  54. data/lib/claude_memory/commands/initializers/global_initializer.rb +85 -0
  55. data/lib/claude_memory/commands/initializers/hooks_configurator.rb +156 -0
  56. data/lib/claude_memory/commands/initializers/mcp_configurator.rb +56 -0
  57. data/lib/claude_memory/commands/initializers/memory_instructions_writer.rb +135 -0
  58. data/lib/claude_memory/commands/initializers/project_initializer.rb +111 -0
  59. data/lib/claude_memory/commands/recover_command.rb +75 -0
  60. data/lib/claude_memory/commands/registry.rb +5 -1
  61. data/lib/claude_memory/commands/stats_command.rb +239 -0
  62. data/lib/claude_memory/commands/uninstall_command.rb +226 -0
  63. data/lib/claude_memory/core/batch_loader.rb +32 -0
  64. data/lib/claude_memory/core/concept_ranker.rb +73 -0
  65. data/lib/claude_memory/core/embedding_candidate_builder.rb +37 -0
  66. data/lib/claude_memory/core/fact_collector.rb +51 -0
  67. data/lib/claude_memory/core/fact_query_builder.rb +154 -0
  68. data/lib/claude_memory/core/fact_ranker.rb +113 -0
  69. data/lib/claude_memory/core/result_builder.rb +54 -0
  70. data/lib/claude_memory/core/result_sorter.rb +25 -0
  71. data/lib/claude_memory/core/scope_filter.rb +61 -0
  72. data/lib/claude_memory/core/text_builder.rb +29 -0
  73. data/lib/claude_memory/embeddings/generator.rb +161 -0
  74. data/lib/claude_memory/embeddings/similarity.rb +69 -0
  75. data/lib/claude_memory/hook/handler.rb +4 -3
  76. data/lib/claude_memory/index/lexical_fts.rb +7 -2
  77. data/lib/claude_memory/infrastructure/operation_tracker.rb +158 -0
  78. data/lib/claude_memory/infrastructure/schema_validator.rb +206 -0
  79. data/lib/claude_memory/ingest/content_sanitizer.rb +6 -7
  80. data/lib/claude_memory/ingest/ingester.rb +99 -15
  81. data/lib/claude_memory/ingest/metadata_extractor.rb +57 -0
  82. data/lib/claude_memory/ingest/tool_extractor.rb +71 -0
  83. data/lib/claude_memory/mcp/response_formatter.rb +331 -0
  84. data/lib/claude_memory/mcp/server.rb +19 -0
  85. data/lib/claude_memory/mcp/setup_status_analyzer.rb +73 -0
  86. data/lib/claude_memory/mcp/tool_definitions.rb +279 -0
  87. data/lib/claude_memory/mcp/tool_helpers.rb +80 -0
  88. data/lib/claude_memory/mcp/tools.rb +330 -320
  89. data/lib/claude_memory/recall/dual_query_template.rb +63 -0
  90. data/lib/claude_memory/recall.rb +304 -237
  91. data/lib/claude_memory/resolve/resolver.rb +52 -49
  92. data/lib/claude_memory/store/sqlite_store.rb +210 -144
  93. data/lib/claude_memory/store/store_manager.rb +6 -6
  94. data/lib/claude_memory/sweep/sweeper.rb +6 -0
  95. data/lib/claude_memory/version.rb +1 -1
  96. data/lib/claude_memory.rb +35 -3
  97. metadata +71 -11
  98. data/.claude/.mind.mv2.aLCUZd +0 -0
  99. data/.claude/memory.sqlite3 +0 -0
  100. data/.mcp.json +0 -11
  101. /data/docs/{feature_adoption_plan.md → plans/feature_adoption_plan.md} +0 -0
  102. /data/docs/{feature_adoption_plan_revised.md → plans/feature_adoption_plan_revised.md} +0 -0
  103. /data/docs/{plan.md → plans/plan.md} +0 -0
  104. /data/docs/{updated_plan.md → plans/updated_plan.md} +0 -0
@@ -0,0 +1,161 @@
1
# frozen_string_literal: true

require "digest"

module ClaudeMemory
  module Embeddings
    # Lightweight embedding generator built on a TF-IDF bag-of-words model
    # plus hashed unigram/bigram positional features.
    #
    # Produces unit-length 384-dimensional vectors suitable for cosine
    # similarity. Works without heavy ML dependencies; can later be swapped
    # for a transformer-based model behind the same interface.
    class Generator
      EMBEDDING_DIM = 384

      # Fixed vocabulary of technical/programming terms. Each word owns one
      # slot of the TF-IDF portion of the vector (duplicate words resolve to
      # their last index, matching the Hash construction in #initialize).
      VOCABULARY = %w[
        database framework library module class function method
        api rest graphql http request response server client
        authentication authorization token session cookie jwt
        user admin role permission access control security
        error exception handling validation sanitization
        test spec unit integration end-to-end e2e
        frontend backend fullstack ui ux component
        react vue angular svelte javascript typescript
        ruby python java go rust php elixir
        sql nosql postgresql mysql mongodb redis sqlite
        docker kubernetes container orchestration deployment
        git branch commit merge pull push repository
        configuration environment variable setting preference
        logger logging debug trace info warn error
        cache caching storage persistence state
        async await promise callback thread process
        route routing middleware handler controller
        model view template render component
        form input button submit validation
        dependency injection service factory singleton
        migration schema table column index constraint
        query filter sort pagination limit offset
        create read update delete crud operation
        json xml yaml csv format serialization
        encrypt decrypt hash salt cipher algorithm
        webhook event listener subscriber publisher
        job queue worker background task schedule
        metric monitoring performance optimization
        refactor cleanup technical debt improvement
      ].freeze

      def initialize
        @term_index = VOCABULARY.each_with_index.to_h
        @idf = build_idf_weights
      end

      # Generate a normalized embedding vector for +text+.
      #
      # @param text [String, nil] input text to embed
      # @return [Array<Float>] unit-length EMBEDDING_DIM vector (all zeros
      #   when the input is blank or contains no word characters)
      def generate(text)
        return zero_vector if text.nil? || text.empty?

        tokens = extract_tokens(text.downcase)
        return zero_vector if tokens.empty?

        combined = tfidf_vector(term_frequencies(tokens)) + positional_features(tokens)
        unit_normalize(fit_to_dimension(combined))
      end

      private

      # Simple tokenization: runs of word characters.
      def extract_tokens(text)
        text.scan(/\w+/)
      end

      # Term frequencies normalized by the most frequent token's count.
      def term_frequencies(tokens)
        counts = tokens.tally
        peak = counts.values.max.to_f
        counts.transform_values { |count| count / peak }
      end

      # Project normalized frequencies onto the fixed vocabulary slots,
      # weighting each in-vocabulary term by its IDF (1.0 when unknown).
      def tfidf_vector(frequencies)
        vector = Array.new(VOCABULARY.size, 0.0)
        frequencies.each do |term, tf|
          slot = @term_index[term]
          next if slot.nil?

          vector[slot] = tf * (@idf[term] || 1.0)
        end
        vector
      end

      # Heuristic IDF table: very common stop-words get a low weight, every
      # vocabulary term defaults to a higher weight.
      def build_idf_weights
        stop_words = %w[the is are was were be been being have has had do does did
                        for with from that this these those can could would should
                        will make get set add remove update delete create]
        weights = stop_words.to_h { |word| [word, 0.5] }
        VOCABULARY.each { |term| weights[term] ||= 2.0 }
        weights
      end

      # Hash unigrams (with coarse position buckets) and bigrams into the
      # feature slots left over after the vocabulary, then scale by the
      # largest bucket value so features stay in [0, 1].
      def positional_features(tokens)
        dim = EMBEDDING_DIM - VOCABULARY.size
        features = Array.new(dim, 0.0)

        tokens.each_with_index do |token, position|
          features[slot_for("#{token}_#{position % 10}", dim)] += 1.0
        end

        tokens.each_cons(2) do |left, right|
          features[slot_for("#{left}_#{right}", dim)] += 0.5
        end

        peak = features.max
        return features unless peak && peak.positive?

        features.map { |value| value / peak }
      end

      # Deterministic feature-hashing bucket (MD5 is used purely as a hash
      # function here, not for security).
      def slot_for(key, dim)
        Digest::MD5.hexdigest(key).to_i(16) % dim
      end

      # Defensive sizing: pad with zeros or truncate to EMBEDDING_DIM.
      def fit_to_dimension(vector)
        return vector.first(EMBEDDING_DIM) if vector.size > EMBEDDING_DIM

        vector + Array.new(EMBEDDING_DIM - vector.size, 0.0)
      end

      # Scale to unit length so cosine similarity reduces to a dot product.
      def unit_normalize(vector)
        norm = Math.sqrt(vector.sum { |value| value * value })
        norm.zero? ? vector : vector.map { |value| value / norm }
      end

      def zero_vector
        Array.new(EMBEDDING_DIM, 0.0)
      end
    end
  end
end
@@ -0,0 +1,69 @@
1
# frozen_string_literal: true

module ClaudeMemory
  module Embeddings
    # Cosine-similarity helpers for comparing embedding vectors.
    # All methods assume vectors are already normalized to unit length.
    class Similarity
      class << self
        # Cosine similarity between two unit-length vectors.
        # For normalized vectors this is simply the dot product, clamped to
        # [0, 1] to absorb floating-point error.
        #
        # @param vec_a [Array<Float>] first vector
        # @param vec_b [Array<Float>] second vector
        # @return [Float] similarity score between 0 and 1
        def cosine(vec_a, vec_b)
          return 0.0 if vec_a.nil? || vec_b.nil?
          return 0.0 if vec_a.empty? || vec_b.empty?

          dot = vec_a.each_with_index.sum { |component, i| component * vec_b[i] }
          dot.clamp(0.0, 1.0)
        end

        # Find the K candidates most similar to the query.
        #
        # @param query_vector [Array<Float>] query embedding
        # @param candidates [Array<Hash>] hashes carrying an :embedding key
        # @param k [Integer] number of top results to return
        # @return [Array<Hash>] up to K entries of {candidate:, similarity:},
        #   highest similarity first
        def top_k(query_vector, candidates, k)
          return [] if candidates.empty?

          candidates
            .map { |candidate| {candidate: candidate, similarity: cosine(query_vector, candidate[:embedding])} }
            .sort_by { |entry| -entry[:similarity] }
            .take(k)
        end

        # Average similarity of a query against several target vectors.
        # Useful for multi-concept queries.
        #
        # @param query_vector [Array<Float>] query embedding
        # @param target_vectors [Array<Array<Float>>] target embeddings
        # @return [Float] mean similarity (0.0 when there are no targets)
        def average_similarity(query_vector, target_vectors)
          return 0.0 if target_vectors.empty?

          scores = batch_similarities(query_vector, target_vectors)
          scores.sum / scores.size.to_f
        end

        # Similarities between one query and many candidates, preserving
        # candidate order.
        #
        # @param query_vector [Array<Float>] query embedding
        # @param candidate_vectors [Array<Array<Float>>] candidate embeddings
        # @return [Array<Float>] one score per candidate
        def batch_similarities(query_vector, candidate_vectors)
          candidate_vectors.map { |vec| cosine(query_vector, vec) }
        end
      end
    end
  end
end
@@ -9,13 +9,14 @@ module ClaudeMemory
9
9
 
10
10
  def initialize(store, env: ENV)
11
11
  @store = store
12
+ @config = Configuration.new(env)
12
13
  @env = env
13
14
  end
14
15
 
15
16
  def ingest(payload)
16
- session_id = payload["session_id"] || @env["CLAUDE_SESSION_ID"]
17
- transcript_path = payload["transcript_path"] || @env["CLAUDE_TRANSCRIPT_PATH"]
18
- project_path = payload["project_path"] || @env["CLAUDE_PROJECT_DIR"] || Dir.pwd
17
+ session_id = payload["session_id"] || @config.session_id
18
+ transcript_path = payload["transcript_path"] || @config.transcript_path
19
+ project_path = payload["project_path"] || @config.project_dir
19
20
 
20
21
  raise PayloadError, "Missing required field: session_id" if session_id.nil? || session_id.empty?
21
22
  raise PayloadError, "Missing required field: transcript_path" if transcript_path.nil? || transcript_path.empty?
@@ -6,10 +6,11 @@ module ClaudeMemory
6
6
  def initialize(store)
7
7
  @store = store
8
8
  @db = store.db
9
- ensure_fts_table!
9
+ @fts_table_ensured = false
10
10
  end
11
11
 
12
12
  def index_content_item(content_item_id, text)
13
+ ensure_fts_table!
13
14
  existing = @db[:content_fts].where(content_item_id: content_item_id).get(:content_item_id)
14
15
  return if existing
15
16
 
@@ -17,6 +18,7 @@ module ClaudeMemory
17
18
  end
18
19
 
19
20
  def search(query, limit: 20)
21
+ ensure_fts_table!
20
22
  return [] if query.nil? || query.strip.empty?
21
23
 
22
24
  if query.strip == "*"
@@ -48,10 +50,13 @@ module ClaudeMemory
48
50
  private
49
51
 
50
52
  def ensure_fts_table!
53
+ return if @fts_table_ensured
54
+
51
55
  @db.run(<<~SQL)
52
- CREATE VIRTUAL TABLE IF NOT EXISTS content_fts
56
+ CREATE VIRTUAL TABLE IF NOT EXISTS content_fts
53
57
  USING fts5(content_item_id UNINDEXED, text, tokenize='porter unicode61')
54
58
  SQL
59
+ @fts_table_ensured = true
55
60
  end
56
61
  end
57
62
  end
@@ -0,0 +1,158 @@
1
# frozen_string_literal: true

require "json"
require "time"

module ClaudeMemory
  module Infrastructure
    # Tracks long-running operations with checkpoints for resumability.
    # Detects stuck operations (running for more than 24h) and provides
    # recovery mechanisms.
    #
    # Checkpoint payloads are stored as JSON strings in
    # operation_progress.checkpoint_data; error details are recorded under
    # the "error" key whenever an operation fails or is reset.
    class OperationTracker
      STALE_THRESHOLD_SECONDS = 86400 # 24 hours

      # @param store [#db] storage object exposing a Sequel database
      def initialize(store)
        @store = store
      end

      # Start tracking a new operation. Any stale (>24h) running operations
      # with the same type/scope are first marked failed, so at most one
      # fresh run is "running" per type/scope.
      #
      # @return [Integer] the new operation's id
      def start_operation(operation_type:, scope:, total_items: nil, checkpoint_data: {})
        cleanup_stale_operations!(operation_type, scope)

        @store.db[:operation_progress].insert(
          operation_type: operation_type,
          scope: scope,
          status: "running",
          total_items: total_items,
          processed_items: 0,
          checkpoint_data: checkpoint_data.to_json,
          started_at: Time.now.utc.iso8601,
          completed_at: nil
        )
      end

      # Update progress, optionally replacing the checkpoint payload.
      def update_progress(operation_id, processed_items:, checkpoint_data: nil)
        updates = {processed_items: processed_items}
        updates[:checkpoint_data] = checkpoint_data.to_json if checkpoint_data
        @store.db[:operation_progress].where(id: operation_id).update(updates)
      end

      # Mark operation as completed.
      def complete_operation(operation_id)
        @store.db[:operation_progress].where(id: operation_id).update(
          status: "completed",
          completed_at: Time.now.utc.iso8601
        )
      end

      # Mark operation as failed, preserving the existing checkpoint and
      # recording the error message under the "error" key (string key, for
      # consistency with the recovery paths in mark_failed!).
      def fail_operation(operation_id, error_message)
        raw = @store.db[:operation_progress].where(id: operation_id).get(:checkpoint_data)
        checkpoint = raw ? JSON.parse(raw) : {}
        checkpoint["error"] = error_message

        @store.db[:operation_progress].where(id: operation_id).update(
          status: "failed",
          completed_at: Time.now.utc.iso8601,
          checkpoint_data: JSON.generate(checkpoint)
        )
      end

      # Checkpoint data for resuming the most recent non-stale (<24h)
      # running operation of the given type/scope.
      #
      # @return [Hash, nil] {operation_id:, checkpoint_data:,
      #   processed_items:, total_items:, started_at:} or nil when nothing
      #   is resumable
      def get_checkpoint(operation_type:, scope:)
        threshold_time = stale_threshold_iso8601

        op = @store.db[:operation_progress]
          .where(operation_type: operation_type, scope: scope, status: "running")
          .where { started_at >= threshold_time } # exclude stale operations
          .order(Sequel.desc(:started_at))
          .first

        return nil unless op

        checkpoint_data = op[:checkpoint_data] ? JSON.parse(op[:checkpoint_data], symbolize_names: true) : {}
        {
          operation_id: op[:id],
          checkpoint_data: checkpoint_data,
          processed_items: op[:processed_items] || 0,
          total_items: op[:total_items],
          started_at: op[:started_at]
        }
      end

      # All operations stuck in "running" status for more than 24h.
      def stuck_operations
        threshold_time = stale_threshold_iso8601

        @store.db[:operation_progress]
          .where(status: "running")
          .where { started_at < threshold_time }
          .all
      end

      # Reset stuck operations (optionally filtered by type/scope) to
      # "failed" status.
      #
      # @return [Integer] number of operations reset
      def reset_stuck_operations(operation_type: nil, scope: nil)
        dataset = @store.db[:operation_progress].where(status: "running")
        dataset = dataset.where(operation_type: operation_type) if operation_type
        dataset = dataset.where(scope: scope) if scope

        threshold_time = stale_threshold_iso8601
        # Fetch once and reuse: avoids a separate COUNT query that could
        # disagree with the rows actually updated.
        stuck = dataset.where { started_at < threshold_time }.all

        mark_failed!(stuck, "Reset by recover command - operation exceeded 24h timeout")
        stuck.size
      end

      private

      # Mark stale running operations of this type/scope as failed before a
      # new operation begins.
      def cleanup_stale_operations!(operation_type, scope)
        threshold_time = stale_threshold_iso8601

        stale = @store.db[:operation_progress]
          .where(operation_type: operation_type, scope: scope, status: "running")
          .where { started_at < threshold_time }
          .all

        mark_failed!(stale, "Automatically marked as failed - operation exceeded 24h timeout")
      end

      # Shared failure path: merge the error message into each operation's
      # checkpoint (updated in Ruby, since the JSON blob cannot be patched
      # in place via SQL here) and flip its status to failed.
      def mark_failed!(operations, error_message)
        now = Time.now.utc.iso8601

        operations.each do |op|
          checkpoint = op[:checkpoint_data] ? JSON.parse(op[:checkpoint_data]) : {}
          checkpoint["error"] = error_message

          @store.db[:operation_progress]
            .where(id: op[:id])
            .update(
              status: "failed",
              completed_at: now,
              checkpoint_data: JSON.generate(checkpoint)
            )
        end
      end

      # ISO8601 timestamp marking the staleness cutoff (now - 24h).
      def stale_threshold_iso8601
        (Time.now.utc - STALE_THRESHOLD_SECONDS).iso8601
      end
    end
  end
end
@@ -0,0 +1,206 @@
1
# frozen_string_literal: true

require "json"

module ClaudeMemory
  module Infrastructure
    # Validates database schema integrity and data consistency, recording
    # each validation result in the schema_health table.
    class SchemaValidator
      EXPECTED_TABLES = %i[
        meta content_items delta_cursors entities entity_aliases facts
        provenance fact_links conflicts tool_calls
        operation_progress schema_health
      ].freeze

      # The FTS table is created lazily, so its absence is not an issue.
      # (Documentation only — not checked by #validate.)
      OPTIONAL_TABLES = %i[content_fts].freeze

      CRITICAL_COLUMNS = {
        facts: %i[id subject_entity_id predicate status scope project_path embedding_json],
        content_items: %i[id source session_id text_hash ingested_at source_mtime],
        entities: %i[id type canonical_name slug],
        operation_progress: %i[id operation_type scope status started_at]
      }.freeze

      CRITICAL_INDEXES = %i[
        idx_facts_predicate idx_facts_subject idx_facts_status idx_facts_scope
        idx_facts_project idx_provenance_fact idx_content_items_session
        idx_operation_progress_type idx_operation_progress_status
      ].freeze

      # Expected embedding vector size; keep in sync with
      # Embeddings::Generator::EMBEDDING_DIM.
      EXPECTED_EMBEDDING_DIM = 384

      # Number of facts sampled when spot-checking embedding dimensions.
      EMBEDDING_SAMPLE_SIZE = 10

      # @param store [#db] storage object exposing a Sequel database
      def initialize(store)
        @store = store
      end

      # Run all structural and data-consistency checks, persist the result
      # to schema_health, and return a summary.
      #
      # @return [Hash] {valid: Boolean, issues: Array<Hash>} where each issue
      #   is {severity: "error"|"warning", message: String}; valid is false
      #   only when at least one error-severity issue was found
      def validate
        issues = []

        # Structural checks: tables, columns, indexes
        tables = @store.db.tables
        (EXPECTED_TABLES - tables).each do |table|
          issues << {severity: "error", message: "Missing table: #{table}"}
        end

        CRITICAL_COLUMNS.each do |table, columns|
          next unless tables.include?(table)

          existing_columns = @store.db.schema(table).map(&:first)
          (columns - existing_columns).each do |column|
            issues << {severity: "error", message: "Missing column #{table}.#{column}"}
          end
        end

        index_names = @store.db["SELECT name FROM sqlite_master WHERE type='index'"]
          .all.map { |r| r[:name] }
        (CRITICAL_INDEXES - index_names.map(&:to_sym)).each do |index|
          issues << {severity: "warning", message: "Missing index: #{index}"}
        end

        # Referential integrity
        check_orphaned_provenance(issues)
        check_orphaned_fact_links(issues)
        check_orphaned_tool_calls(issues)

        # Enum sanity
        check_invalid_fact_scopes(issues)
        check_invalid_fact_status(issues)
        check_invalid_operation_status(issues)

        # Stored embedding sanity
        check_embedding_dimensions(issues)

        record_health_check(issues)

        {
          valid: issues.none? { |i| i[:severity] == "error" },
          issues: issues
        }
      end

      private

      # Provenance rows whose fact no longer exists.
      def check_orphaned_provenance(issues)
        orphaned = @store.db[:provenance]
          .left_join(:facts, id: :fact_id)
          .where(Sequel[:facts][:id] => nil)
          .count

        if orphaned > 0
          issues << {severity: "error", message: "#{orphaned} orphaned provenance record(s) without corresponding facts"}
        end
      end

      # Fact links whose from- or to-fact no longer exists.
      # NOTE: a link orphaned on both sides is counted twice; the total is a
      # severity signal, not an exact row count.
      def check_orphaned_fact_links(issues)
        orphaned_from = @store.db[:fact_links]
          .left_join(:facts, id: :from_fact_id)
          .where(Sequel[:facts][:id] => nil)
          .count

        orphaned_to = @store.db[:fact_links]
          .left_join(Sequel[:facts].as(:to_facts), id: :to_fact_id)
          .where(Sequel[:to_facts][:id] => nil)
          .count

        total_orphaned = orphaned_from + orphaned_to
        if total_orphaned > 0
          issues << {severity: "error", message: "#{total_orphaned} orphaned fact_links record(s)"}
        end
      end

      # Tool calls whose content item no longer exists (warning only).
      def check_orphaned_tool_calls(issues)
        orphaned = @store.db[:tool_calls]
          .left_join(:content_items, id: :content_item_id)
          .where(Sequel[:content_items][:id] => nil)
          .count

        if orphaned > 0
          issues << {severity: "warning", message: "#{orphaned} orphaned tool_calls record(s) without corresponding content_items"}
        end
      end

      # Facts must be scoped either "global" or "project".
      def check_invalid_fact_scopes(issues)
        invalid = @store.facts
          .where(Sequel.~(scope: %w[global project]))
          .count

        if invalid > 0
          issues << {severity: "error", message: "#{invalid} fact(s) with invalid scope (must be 'global' or 'project')"}
        end
      end

      # Non-standard fact statuses are tolerated but flagged.
      def check_invalid_fact_status(issues)
        valid_statuses = %w[active superseded]
        invalid = @store.facts
          .where(Sequel.~(status: valid_statuses))
          .count

        if invalid > 0
          issues << {severity: "warning", message: "#{invalid} fact(s) with non-standard status"}
        end
      end

      # Operation rows must carry one of the known lifecycle statuses.
      def check_invalid_operation_status(issues)
        return unless @store.db.tables.include?(:operation_progress)

        valid_statuses = %w[running completed failed]
        invalid = @store.operation_progress
          .where(Sequel.~(status: valid_statuses))
          .count

        if invalid > 0
          issues << {severity: "error", message: "#{invalid} operation(s) with invalid status"}
        end
      end

      # Spot-check a small sample of stored embeddings for the expected
      # vector size; stops at the first bad row to keep the report short.
      def check_embedding_dimensions(issues)
        sample = @store.facts
          .where(Sequel.~(embedding_json: nil))
          .select(:id, :embedding_json)
          .limit(EMBEDDING_SAMPLE_SIZE)

        sample.each do |fact|
          begin
            embedding = JSON.parse(fact[:embedding_json])
          rescue JSON::ParserError
            # Identify the offending row instead of a generic column-level error.
            issues << {severity: "error", message: "Fact #{fact[:id]} has invalid JSON in embedding_json column"}
            break
          end

          if embedding.size != EXPECTED_EMBEDDING_DIM
            issues << {severity: "error", message: "Fact #{fact[:id]} has embedding with incorrect dimensions (#{embedding.size}, expected #{EXPECTED_EMBEDDING_DIM})"}
            break # only report the first occurrence
          end
        end
      end

      # Persist the validation outcome plus a per-table row-count snapshot.
      def record_health_check(issues)
        now = Time.now.utc.iso8601
        version = @store.schema_version

        table_counts = {}
        @store.db.tables.each do |table|
          table_counts[table.to_s] = @store.db[table].count
        end

        validation_status = if issues.any? { |i| i[:severity] == "error" }
          "corrupt"
        elsif issues.any?
          "degraded"
        else
          "healthy"
        end

        @store.schema_health.insert(
          checked_at: now,
          schema_version: version,
          validation_status: validation_status,
          issues_json: issues.to_json,
          table_counts_json: table_counts.to_json
        )
      end
    end
  end
end
@@ -2,22 +2,21 @@
2
2
 
3
3
  module ClaudeMemory
4
4
  module Ingest
5
+ # Strips privacy tags from transcript content before ingestion.
6
+ #
7
+ # Note: No tag count limit is enforced. The regex pattern /<tag>.*?<\/tag>/m
8
+ # is provably safe from ReDoS (non-greedy matching with clear delimiters).
9
+ # Performance is O(n) and excellent even with 1000+ tags (~0.6ms).
10
+ # Long Claude sessions legitimately accumulate many tags (100-200+).
5
11
  class ContentSanitizer
6
12
  SYSTEM_TAGS = ["claude-memory-context"].freeze
7
13
  USER_TAGS = ["private", "no-memory", "secret"].freeze
8
- MAX_TAG_COUNT = 100
9
14
 
10
15
  def self.strip_tags(text)
11
16
  tags = Pure.all_tags
12
- validate_tag_count!(text, tags)
13
17
  Pure.strip_tags(text, tags)
14
18
  end
15
19
 
16
- def self.validate_tag_count!(text, tags)
17
- count = Pure.count_tags(text, tags)
18
- raise Error, "Too many privacy tags (#{count}), possible ReDoS attack" if count > MAX_TAG_COUNT
19
- end
20
-
21
20
  module Pure
22
21
  def self.all_tags
23
22
  @all_tags ||= begin