htm 0.0.11 → 0.0.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dictate.toml +46 -0
- data/.envrc +2 -0
- data/CHANGELOG.md +52 -2
- data/README.md +79 -0
- data/Rakefile +14 -2
- data/bin/htm_mcp.rb +94 -0
- data/config/database.yml +20 -13
- data/db/migrate/00010_add_soft_delete_to_associations.rb +29 -0
- data/db/migrate/00011_add_performance_indexes.rb +21 -0
- data/db/migrate/00012_add_tags_trigram_index.rb +18 -0
- data/db/migrate/00013_enable_lz4_compression.rb +43 -0
- data/db/schema.sql +49 -92
- data/docs/api/index.md +1 -1
- data/docs/api/yard/HTM.md +2 -4
- data/docs/architecture/index.md +1 -1
- data/docs/development/index.md +1 -1
- data/docs/getting-started/index.md +1 -1
- data/docs/guides/index.md +1 -1
- data/docs/images/telemetry-architecture.svg +153 -0
- data/docs/telemetry.md +391 -0
- data/examples/README.md +46 -1
- data/examples/cli_app/README.md +1 -1
- data/examples/cli_app/htm_cli.rb +1 -1
- data/examples/sinatra_app/app.rb +1 -1
- data/examples/telemetry/README.md +147 -0
- data/examples/telemetry/SETUP_README.md +169 -0
- data/examples/telemetry/demo.rb +498 -0
- data/examples/telemetry/grafana/dashboards/htm-metrics.json +457 -0
- data/lib/htm/configuration.rb +261 -70
- data/lib/htm/database.rb +46 -22
- data/lib/htm/embedding_service.rb +24 -14
- data/lib/htm/errors.rb +15 -1
- data/lib/htm/jobs/generate_embedding_job.rb +19 -0
- data/lib/htm/jobs/generate_propositions_job.rb +103 -0
- data/lib/htm/jobs/generate_tags_job.rb +24 -0
- data/lib/htm/loaders/markdown_chunker.rb +79 -0
- data/lib/htm/loaders/markdown_loader.rb +41 -15
- data/lib/htm/long_term_memory/fulltext_search.rb +138 -0
- data/lib/htm/long_term_memory/hybrid_search.rb +324 -0
- data/lib/htm/long_term_memory/node_operations.rb +209 -0
- data/lib/htm/long_term_memory/relevance_scorer.rb +355 -0
- data/lib/htm/long_term_memory/robot_operations.rb +34 -0
- data/lib/htm/long_term_memory/tag_operations.rb +428 -0
- data/lib/htm/long_term_memory/vector_search.rb +109 -0
- data/lib/htm/long_term_memory.rb +51 -1153
- data/lib/htm/models/node.rb +35 -2
- data/lib/htm/models/node_tag.rb +31 -0
- data/lib/htm/models/robot_node.rb +31 -0
- data/lib/htm/models/tag.rb +44 -0
- data/lib/htm/proposition_service.rb +169 -0
- data/lib/htm/query_cache.rb +214 -0
- data/lib/htm/sql_builder.rb +178 -0
- data/lib/htm/tag_service.rb +16 -6
- data/lib/htm/tasks.rb +8 -2
- data/lib/htm/telemetry.rb +224 -0
- data/lib/htm/version.rb +1 -1
- data/lib/htm.rb +64 -3
- data/lib/tasks/doc.rake +1 -1
- data/lib/tasks/htm.rake +259 -13
- data/mkdocs.yml +96 -96
- metadata +42 -16
- data/.aigcm_msg +0 -1
- data/.claude/settings.local.json +0 -95
- data/CLAUDE.md +0 -603
- data/examples/cli_app/temp.log +0 -93
- data/lib/htm/loaders/paragraph_chunker.rb +0 -112
- data/notes/ARCHITECTURE_REVIEW.md +0 -1167
- data/notes/IMPLEMENTATION_SUMMARY.md +0 -606
- data/notes/MULTI_FRAMEWORK_IMPLEMENTATION.md +0 -451
- data/notes/next_steps.md +0 -100
- data/notes/plan.md +0 -627
- data/notes/tag_ontology_enhancement_ideas.md +0 -222
- data/notes/timescaledb_removal_summary.md +0 -200
data/lib/htm/errors.rb
CHANGED
```diff
@@ -1,6 +1,6 @@
 # frozen_string_literal: true
 
-# HTM (Hierarchical
+# HTM (Hierarchical Temporal Memory) error classes
 #
 # All HTM errors inherit from HTM::Error, allowing you to catch
 # all HTM-related errors with a single rescue clause.
@@ -93,6 +93,20 @@ class HTM
   #
   class TagError < Error; end
 
+  # Raised when proposition extraction fails
+  #
+  # Common causes:
+  # - LLM provider API errors
+  # - Invalid proposition response format
+  # - Network connectivity issues
+  # - Model not available
+  #
+  # Note: This error is distinct from CircuitBreakerOpenError.
+  # PropositionError indicates a single failure, while CircuitBreakerOpenError
+  # indicates repeated failures have triggered protective circuit breaking.
+  #
+  class PropositionError < Error; end
+
   # Raised when database operations fail
   #
   # Common causes:
```
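The distinction called out in the new comment matters to callers. A minimal rescue sketch, assuming `HTM::PropositionService.extract` raises these errors the way the job code below suggests (the `schedule_retry` helper and sample content are hypothetical, not part of the gem):

```ruby
require 'htm'

content = 'Ruby 3.3 shipped with a new JIT compiler. It is called YJIT.'

begin
  propositions = HTM::PropositionService.extract(content)
  propositions.each { |p| puts p }
rescue HTM::CircuitBreakerOpenError
  # Repeated failures have tripped the breaker - back off and retry later
  schedule_retry # hypothetical helper, not part of the gem
rescue HTM::PropositionError => e
  # A single extraction failed - log it and move on
  HTM.logger.error "Proposition extraction failed: #{e.message}"
rescue HTM::Error => e
  # Everything in HTM inherits from HTM::Error, so one rescue catches the rest
  HTM.logger.error "HTM error: #{e.message}"
end
```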
data/lib/htm/jobs/generate_embedding_job.rb
CHANGED

```diff
@@ -36,6 +36,9 @@ class HTM
           return
         end
 
+        provider = HTM.configuration.embedding_provider.to_s
+        start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+
         begin
           HTM.logger.debug "GenerateEmbeddingJob: Generating embedding for node #{node_id}"
 
@@ -45,17 +48,33 @@ class HTM
           # Update node with processed embedding
           node.update!(embedding: result[:storage_embedding])
 
+          # Record success metrics
+          elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
+          HTM::Telemetry.embedding_latency.record(elapsed_ms, attributes: { 'provider' => provider, 'status' => 'success' })
+          HTM::Telemetry.job_counter.add(1, attributes: { 'job' => 'embedding', 'status' => 'success' })
+
           HTM.logger.info "GenerateEmbeddingJob: Successfully generated embedding for node #{node_id} (#{result[:dimension]} dimensions)"
 
         rescue HTM::CircuitBreakerOpenError => e
           # Circuit breaker is open - service is unavailable, will retry later
+          HTM::Telemetry.job_counter.add(1, attributes: { 'job' => 'embedding', 'status' => 'circuit_open' })
           HTM.logger.warn "GenerateEmbeddingJob: Circuit breaker open for node #{node_id}, will retry when service recovers"
 
         rescue HTM::EmbeddingError => e
+          # Record failure metrics
+          elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
+          HTM::Telemetry.embedding_latency.record(elapsed_ms, attributes: { 'provider' => provider, 'status' => 'error' })
+          HTM::Telemetry.job_counter.add(1, attributes: { 'job' => 'embedding', 'status' => 'error' })
+
           # Log embedding-specific errors
           HTM.logger.error "GenerateEmbeddingJob: Embedding generation failed for node #{node_id}: #{e.message}"
 
         rescue StandardError => e
+          # Record failure metrics
+          elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
+          HTM::Telemetry.embedding_latency.record(elapsed_ms, attributes: { 'provider' => provider, 'status' => 'error' })
+          HTM::Telemetry.job_counter.add(1, attributes: { 'job' => 'embedding', 'status' => 'error' })
+
           # Log unexpected errors
           HTM.logger.error "GenerateEmbeddingJob: Unexpected error for node #{node_id}: #{e.class.name} - #{e.message}"
           HTM.logger.debug e.backtrace.first(5).join("\n")
```
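Both the success and failure paths above time the call with `Process::CLOCK_MONOTONIC` rather than `Time.now`; the monotonic clock never jumps under NTP or DST adjustments, so elapsed-time math stays honest. The pattern in isolation:

```ruby
# Measure elapsed milliseconds with a clock that cannot run backwards.
start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)

sleep 0.25 # stand-in for the embedding request

elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
puts "took #{elapsed_ms}ms" # => roughly 250
```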
data/lib/htm/jobs/generate_propositions_job.rb
ADDED

```diff
@@ -0,0 +1,103 @@
+# frozen_string_literal: true
+
+require_relative '../errors'
+require_relative '../models/node'
+require_relative '../proposition_service'
+
+class HTM
+  module Jobs
+    # Background job to extract propositions from nodes and create new nodes
+    #
+    # This job is enqueued after a node is saved (if proposition extraction is enabled).
+    # It uses LLM to extract atomic factual propositions from node content and
+    # creates new nodes for each proposition. Proposition nodes are marked with
+    # metadata to prevent recursive extraction.
+    #
+    # @see PropositionService
+    #
+    class GeneratePropositionsJob
+      # Generate propositions for a node
+      #
+      # Uses the configured proposition extractor (HTM.extract_propositions) which
+      # delegates to the application-provided or default RubyLLM implementation.
+      #
+      # @param node_id [Integer] ID of the node to process
+      # @param robot_id [Integer] ID of the robot that owns this node
+      #
+      def self.perform(node_id:, robot_id:)
+        node = HTM::Models::Node.find_by(id: node_id)
+
+        unless node
+          HTM.logger.warn "GeneratePropositionsJob: Node #{node_id} not found"
+          return
+        end
+
+        # Skip if this node is already a proposition (prevent recursion)
+        if node.metadata&.dig('is_proposition')
+          HTM.logger.debug "GeneratePropositionsJob: Node #{node_id} is a proposition, skipping"
+          return
+        end
+
+        begin
+          HTM.logger.debug "GeneratePropositionsJob: Extracting propositions for node #{node_id}"
+
+          # Extract propositions using PropositionService
+          propositions = HTM::PropositionService.extract(node.content)
+
+          if propositions.empty?
+            HTM.logger.debug "GeneratePropositionsJob: No propositions extracted for node #{node_id}"
+            return
+          end
+
+          HTM.logger.info "GeneratePropositionsJob: Extracted #{propositions.length} propositions for node #{node_id}"
+
+          # Create a node for each proposition
+          created_count = 0
+          propositions.each do |proposition_text|
+            # Calculate token count
+            token_count = HTM.count_tokens(proposition_text)
+
+            # Create proposition node with is_proposition marker
+            proposition_node = HTM::Models::Node.create!(
+              content: proposition_text,
+              token_count: token_count,
+              metadata: { is_proposition: true, source_node_id: node_id }
+            )
+
+            # Link to robot via RobotNode
+            HTM::Models::RobotNode.find_or_create_by!(
+              robot_id: robot_id,
+              node_id: proposition_node.id
+            )
+
+            # Enqueue embedding and tag jobs for the new proposition node
+            # (but NOT another propositions job - the is_proposition marker prevents that)
+            HTM::JobAdapter.enqueue(HTM::Jobs::GenerateEmbeddingJob, node_id: proposition_node.id)
+            HTM::JobAdapter.enqueue(HTM::Jobs::GenerateTagsJob, node_id: proposition_node.id)
+
+            created_count += 1
+          end
+
+          HTM.logger.info "GeneratePropositionsJob: Created #{created_count} proposition nodes from node #{node_id}"
+
+        rescue HTM::CircuitBreakerOpenError
+          # Circuit breaker is open - service is unavailable, will retry later
+          HTM.logger.warn "GeneratePropositionsJob: Circuit breaker open for node #{node_id}, will retry when service recovers"
+
+        rescue HTM::PropositionError => e
+          # Log proposition-specific errors
+          HTM.logger.error "GeneratePropositionsJob: Proposition extraction failed for node #{node_id}: #{e.message}"
+
+        rescue ActiveRecord::RecordInvalid => e
+          # Log validation errors
+          HTM.logger.error "GeneratePropositionsJob: Database validation failed for node #{node_id}: #{e.message}"
+
+        rescue StandardError => e
+          # Log unexpected errors
+          HTM.logger.error "GeneratePropositionsJob: Unexpected error for node #{node_id}: #{e.class.name} - #{e.message}"
+          HTM.logger.debug e.backtrace.first(5).join("\n")
+        end
+      end
+    end
+  end
+end
```
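On the producer side, the job takes keyword arguments through `HTM::JobAdapter.enqueue`, the same adapter the job itself uses for fan-out. A sketch of the enqueue call, with illustrative values:

```ruby
# Enqueue proposition extraction for a freshly saved node.
# Proposition nodes created by the job get embedding and tag jobs of
# their own, but never another propositions job - the is_proposition
# metadata marker short-circuits the recursion.
HTM::JobAdapter.enqueue(
  HTM::Jobs::GeneratePropositionsJob,
  node_id: node.id,   # illustrative - any persisted Node id
  robot_id: robot.id  # illustrative - the owning robot's id
)
```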
data/lib/htm/jobs/generate_tags_job.rb
CHANGED

```diff
@@ -33,6 +33,9 @@ class HTM
           return
         end
 
+        provider = HTM.configuration.tag_provider.to_s
+        start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+
         begin
           HTM.logger.debug "GenerateTagsJob: Extracting tags for node #{node_id}"
 
@@ -61,21 +64,42 @@ class HTM
            )
          end
 
+          # Record success metrics
+          elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
+          HTM::Telemetry.tag_latency.record(elapsed_ms, attributes: { 'provider' => provider, 'status' => 'success' })
+          HTM::Telemetry.job_counter.add(1, attributes: { 'job' => 'tags', 'status' => 'success' })
+
          HTM.logger.info "GenerateTagsJob: Successfully generated #{tag_names.length} tags for node #{node_id}: #{tag_names.join(', ')}"
 
        rescue HTM::CircuitBreakerOpenError => e
          # Circuit breaker is open - service is unavailable, will retry later
+          HTM::Telemetry.job_counter.add(1, attributes: { 'job' => 'tags', 'status' => 'circuit_open' })
          HTM.logger.warn "GenerateTagsJob: Circuit breaker open for node #{node_id}, will retry when service recovers"
 
        rescue HTM::TagError => e
+          # Record failure metrics
+          elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
+          HTM::Telemetry.tag_latency.record(elapsed_ms, attributes: { 'provider' => provider, 'status' => 'error' })
+          HTM::Telemetry.job_counter.add(1, attributes: { 'job' => 'tags', 'status' => 'error' })
+
          # Log tag-specific errors
          HTM.logger.error "GenerateTagsJob: Tag generation failed for node #{node_id}: #{e.message}"
 
        rescue ActiveRecord::RecordInvalid => e
+          # Record failure metrics
+          elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
+          HTM::Telemetry.tag_latency.record(elapsed_ms, attributes: { 'provider' => provider, 'status' => 'error' })
+          HTM::Telemetry.job_counter.add(1, attributes: { 'job' => 'tags', 'status' => 'error' })
+
          # Log validation errors
          HTM.logger.error "GenerateTagsJob: Database validation failed for node #{node_id}: #{e.message}"
 
        rescue StandardError => e
+          # Record failure metrics
+          elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
+          HTM::Telemetry.tag_latency.record(elapsed_ms, attributes: { 'provider' => provider, 'status' => 'error' })
+          HTM::Telemetry.job_counter.add(1, attributes: { 'job' => 'tags', 'status' => 'error' })
+
          # Log unexpected errors
          HTM.logger.error "GenerateTagsJob: Unexpected error for node #{node_id}: #{e.class.name} - #{e.message}"
          HTM.logger.debug e.backtrace.first(5).join("\n")
```
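The embedding and tag jobs repeat the same record-latency-and-count boilerplate in every rescue branch. A sketch of how that could be factored into one helper; `with_job_metrics` is hypothetical and not part of the gem, and the real jobs also skip latency recording when the circuit breaker is open:

```ruby
# Hypothetical consolidation of the instrumentation pattern above:
# time the block, then record a latency histogram and a status counter.
def with_job_metrics(histogram, job:, provider:)
  start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  status = 'error' # assume failure until the block completes
  result = yield
  status = 'success'
  result
ensure
  elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
  histogram.record(elapsed_ms, attributes: { 'provider' => provider, 'status' => status })
  HTM::Telemetry.job_counter.add(1, attributes: { 'job' => job, 'status' => status })
end

# Usage inside a job:
#   with_job_metrics(HTM::Telemetry.tag_latency, job: 'tags', provider: provider) do
#     extract_and_save_tags(node)
#   end
```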
data/lib/htm/loaders/markdown_chunker.rb
ADDED

```diff
@@ -0,0 +1,79 @@
+# frozen_string_literal: true
+
+require 'baran'
+
+class HTM
+  module Loaders
+    # Markdown-aware text chunker using Baran
+    #
+    # Wraps Baran::MarkdownSplitter to provide intelligent text chunking
+    # that respects markdown structure (headers, code blocks, etc.).
+    #
+    # @example Basic usage
+    #   chunker = MarkdownChunker.new
+    #   chunks = chunker.chunk("# Header\n\nParagraph text.\n\n## Subheader\n\nMore text.")
+    #   # => ["# Header\n\nParagraph text.", "## Subheader\n\nMore text."]
+    #
+    # @example With custom chunk size
+    #   chunker = MarkdownChunker.new(chunk_size: 512, chunk_overlap: 50)
+    #   chunks = chunker.chunk(long_text)
+    #
+    # @example With full metadata (includes cursor positions)
+    #   chunker = MarkdownChunker.new
+    #   chunks = chunker.chunk_with_metadata(text)
+    #   # => [{ text: "...", cursor: 0, metadata: nil }, { text: "...", cursor: 156, metadata: nil }]
+    #
+    class MarkdownChunker
+      # @param chunk_size [Integer] Maximum characters per chunk (default: from config or 1024)
+      # @param chunk_overlap [Integer] Character overlap between chunks (default: from config or 64)
+      def initialize(chunk_size: nil, chunk_overlap: nil)
+        @chunk_size = chunk_size || HTM.configuration.chunk_size
+        @chunk_overlap = chunk_overlap || HTM.configuration.chunk_overlap
+
+        @splitter = Baran::MarkdownSplitter.new(
+          chunk_size: @chunk_size,
+          chunk_overlap: @chunk_overlap
+        )
+      end
+
+      # Split text into markdown-aware chunks (text only)
+      #
+      # @param text [String] Text to chunk
+      # @return [Array<String>] Array of text chunks
+      #
+      def chunk(text)
+        return [] if text.nil? || text.strip.empty?
+
+        # Normalize line endings
+        normalized = text.gsub(/\r\n?/, "\n")
+
+        # Use Baran's MarkdownSplitter
+        result = @splitter.chunks(normalized)
+
+        # Extract text from chunk hashes, filter empty
+        result.map { |chunk| chunk[:text].strip }.reject(&:empty?)
+      end
+
+      # Split text and return full chunk data (with cursor positions)
+      #
+      # Returns Baran's full output including:
+      # - :text [String] The chunk content
+      # - :cursor [Integer] Character offset where chunk starts in original text
+      #
+      # @param text [String] Text to chunk
+      # @return [Array<Hash>] Array of chunk hashes with :text and :cursor
+      #
+      def chunk_with_metadata(text)
+        return [] if text.nil? || text.strip.empty?
+
+        # Normalize line endings
+        normalized = text.gsub(/\r\n?/, "\n")
+
+        # Use Baran's MarkdownSplitter - returns [{text:, cursor:}, ...]
+        @splitter.chunks(normalized)
+      end
+
+      attr_reader :chunk_size, :chunk_overlap
+    end
+  end
+end
```
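Since Baran reports each chunk's `:cursor` as a character offset into the newline-normalized input, a chunk can be traced back to its position in the source. A usage sketch, assuming the gem and its configuration are loaded:

```ruby
require 'htm'

chunker = HTM::Loaders::MarkdownChunker.new(chunk_size: 512, chunk_overlap: 50)
text = File.read('README.md').gsub(/\r\n?/, "\n") # match the chunker's normalization

chunker.chunk_with_metadata(text).each do |chunk|
  # :cursor is the character offset where the chunk starts, so a line
  # number can be derived by counting newlines up to that point.
  line = text[0, chunk[:cursor]].count("\n") + 1
  puts "offset #{chunk[:cursor]} (line #{line}): #{chunk[:text][0, 40].inspect}"
end
```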
data/lib/htm/loaders/markdown_loader.rb
CHANGED

```diff
@@ -26,9 +26,14 @@ class HTM
       MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB maximum file size
 
       # @param htm_instance [HTM] The HTM instance to use for storing nodes
-      def initialize(htm_instance)
+      # @param chunk_size [Integer] Maximum characters per chunk (default: from config)
+      # @param chunk_overlap [Integer] Character overlap between chunks (default: from config)
+      def initialize(htm_instance, chunk_size: nil, chunk_overlap: nil)
         @htm = htm_instance
-        @chunker =
+        @chunker = MarkdownChunker.new(
+          chunk_size: chunk_size,
+          chunk_overlap: chunk_overlap
+        )
       end
 
       # Load a single markdown file into long-term memory
@@ -89,19 +94,19 @@ class HTM
         # Parse frontmatter and body
         frontmatter, body = extract_frontmatter(content)
 
-        # Chunk the body
-        chunks = @chunker.
+        # Chunk the body with metadata (includes cursor positions)
+        chunks = @chunker.chunk_with_metadata(body)
 
         # Prepend frontmatter to first chunk if present
         if frontmatter.any? && chunks.any?
           frontmatter_yaml = YAML.dump(frontmatter).sub(/\A---\n/, "---\n")
-          chunks[0] = "#{frontmatter_yaml}---\n\n#{chunks[0]}"
+          chunks[0][:text] = "#{frontmatter_yaml}---\n\n#{chunks[0][:text]}"
         end
 
         # Save source first (need ID for node association)
         source.save! if source.new_record?
 
-        # Sync chunks to database
+        # Sync chunks to database (chunks now include cursor positions)
         result = sync_chunks(source, chunks)
 
         # Update source record
@@ -181,7 +186,7 @@ class HTM
       # Sync chunks to database, handling updates and deletions
       #
       # @param source [FileSource] The source record
-      # @param chunks [Array<
+      # @param chunks [Array<Hash>] New chunks with :text and :cursor keys
       # @return [Hash] Sync statistics
       #
       def sync_chunks(source, chunks)
@@ -197,12 +202,16 @@ class HTM
         # Track which existing nodes we've matched
         matched_hashes = Set.new
 
-        # Process each new chunk
-        chunks.each_with_index do |
+        # Process each new chunk (chunks are now Hashes with :text and :cursor)
+        chunks.each_with_index do |chunk_data, position|
+          chunk_content = chunk_data[:text].strip
+          chunk_cursor = chunk_data[:cursor]
+          next if chunk_content.empty?
+
           chunk_hash = HTM::Models::Node.generate_content_hash(chunk_content)
 
           if existing_by_hash[chunk_hash]
-            # Chunk exists - update position if needed, restore if soft-deleted
+            # Chunk exists - update position/cursor if needed, restore if soft-deleted
             node = existing_by_hash[chunk_hash]
             matched_hashes << chunk_hash
 
@@ -210,13 +219,20 @@ class HTM
             changes[:chunk_position] = position if node.chunk_position != position
             changes[:deleted_at] = nil if node.deleted_at.present?
 
+            # Update cursor in metadata if changed
+            current_cursor = node.metadata&.dig('cursor')
+            if current_cursor != chunk_cursor
+              new_metadata = (node.metadata || {}).merge('cursor' => chunk_cursor)
+              changes[:metadata] = new_metadata
+            end
+
             if changes.any?
               node.update!(changes)
               updated += 1
             end
           else
-            # New chunk - create node
-            node = create_chunk_node(source, chunk_content, position)
+            # New chunk - create node with cursor in metadata
+            node = create_chunk_node(source, chunk_content, position, cursor: chunk_cursor)
             created += 1 if node
           end
         end
@@ -238,11 +254,15 @@ class HTM
       # @param source [FileSource] The source record
      # @param content [String] Chunk content
      # @param position [Integer] Position in file (0-indexed)
+      # @param cursor [Integer] Character offset in original file
      # @return [Node, nil] The created node or nil if duplicate
      #
-      def create_chunk_node(source, content, position)
+      def create_chunk_node(source, content, position, cursor: nil)
+        # Build metadata with cursor position (file path is in source, not duplicated here)
+        chunk_metadata = cursor ? { 'cursor' => cursor } : {}
+
        # Use remember to get proper embedding/tag processing
-        node_id = @htm.remember(content)
+        node_id = @htm.remember(content, metadata: chunk_metadata)
 
        # Update with source reference
        node = HTM::Models::Node.find(node_id)
@@ -254,7 +274,13 @@ class HTM
         # Find and link to this source
         existing = HTM::Models::Node.find_by_content(content)
         if existing && existing.source_id.nil?
-          existing.update!(source_id: source.id, chunk_position: position)
+          # Merge cursor into existing metadata
+          new_metadata = (existing.metadata || {}).merge('cursor' => cursor) if cursor
+          existing.update!(
+            source_id: source.id,
+            chunk_position: position,
+            metadata: new_metadata || existing.metadata
+          )
        end
        existing
      end
```
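After a load, each chunk node carries that offset under `metadata['cursor']` (the file path stays on the source record, not the node). A sketch of reading it back; the lookup values are illustrative:

```ruby
# Trace a chunk node back to where it came from in the source file.
node = HTM::Models::Node.find(node_id) # node_id is illustrative
offset = node.metadata&.dig('cursor')
puts "chunk ##{node.chunk_position} starts at character #{offset}" if offset
```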
data/lib/htm/long_term_memory/fulltext_search.rb
ADDED

```diff
@@ -0,0 +1,138 @@
+# frozen_string_literal: true
+
+class HTM
+  class LongTermMemory
+    # Full-text search using PostgreSQL tsvector and pg_trgm
+    #
+    # Performs keyword-based search using:
+    # - PostgreSQL full-text search (tsvector/tsquery) for stemmed word matching
+    # - Trigram fuzzy matching (pg_trgm) for typos and partial words
+    # - Combined scoring: tsvector matches rank higher, trigram provides fallback
+    #
+    # Results are cached for performance.
+    #
+    # Security: All queries use parameterized placeholders to prevent SQL injection.
+    #
+    module FulltextSearch
+      # Maximum results to prevent DoS via unbounded queries
+      MAX_FULLTEXT_LIMIT = 1000
+
+      # Minimum trigram similarity threshold (0.0-1.0)
+      # Lower = more fuzzy matches, higher = stricter matching
+      TRIGRAM_SIMILARITY_THRESHOLD = 0.1
+
+      # Score boost for tsvector matches over trigram matches
+      # Ensures exact word matches rank above fuzzy matches
+      TSVECTOR_SCORE_BOOST = 1.0
+
+      # Full-text search
+      #
+      # @param timeframe [Range] Time range to search
+      # @param query [String] Search query
+      # @param limit [Integer] Maximum results (capped at MAX_FULLTEXT_LIMIT)
+      # @param metadata [Hash] Filter by metadata fields (default: {})
+      # @return [Array<Hash>] Matching nodes
+      #
+      def search_fulltext(timeframe:, query:, limit:, metadata: {})
+        # Enforce limit to prevent DoS
+        safe_limit = [[limit.to_i, 1].max, MAX_FULLTEXT_LIMIT].min
+
+        start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+        result = @cache.fetch(:fulltext, timeframe, query, safe_limit, metadata) do
+          search_fulltext_uncached(
+            timeframe: timeframe,
+            query: query,
+            limit: safe_limit,
+            metadata: metadata
+          )
+        end
+        elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
+        HTM::Telemetry.search_latency.record(elapsed_ms, attributes: { 'strategy' => 'fulltext' })
+        result
+      end
+
+      private
+
+      # Uncached full-text search combining tsvector and trigram matching
+      #
+      # Uses UNION to combine:
+      # 1. tsvector matches (stemmed words, high priority)
+      # 2. trigram matches (fuzzy/partial, lower priority fallback)
+      #
+      # Deduplicates by taking highest score per node.
+      #
+      # @param timeframe [nil, Range, Array<Range>] Time range(s) to search (nil = no filter)
+      # @param query [String] Search query
+      # @param limit [Integer] Maximum results
+      # @param metadata [Hash] Filter by metadata fields (default: {})
+      # @return [Array<Hash>] Matching nodes
+      #
+      def search_fulltext_uncached(timeframe:, query:, limit:, metadata: {})
+        # Build filter conditions
+        timeframe_condition = HTM::SqlBuilder.timeframe_condition(timeframe)
+        metadata_condition = HTM::SqlBuilder.metadata_condition(metadata)
+
+        additional_conditions = []
+        additional_conditions << timeframe_condition if timeframe_condition
+        additional_conditions << metadata_condition if metadata_condition
+        additional_sql = additional_conditions.any? ? "AND #{additional_conditions.join(' AND ')}" : ""
+
+        # Combined tsvector + trigram search
+        # tsvector matches get boosted score, trigram provides fuzzy fallback
+        sql = <<~SQL
+          WITH tsvector_matches AS (
+            -- Primary: tsvector full-text search (stemmed word matching)
+            SELECT id, content, access_count, created_at, token_count,
+                   (? + ts_rank(to_tsvector('english', content), plainto_tsquery('english', ?))) as score,
+                   'tsvector' as match_type
+            FROM nodes
+            WHERE deleted_at IS NULL
+              AND to_tsvector('english', content) @@ plainto_tsquery('english', ?)
+              #{additional_sql}
+          ),
+          trigram_matches AS (
+            -- Fallback: trigram fuzzy matching (typos, partial words)
+            SELECT id, content, access_count, created_at, token_count,
+                   similarity(content, ?) as score,
+                   'trigram' as match_type
+            FROM nodes
+            WHERE deleted_at IS NULL
+              AND similarity(content, ?) >= ?
+              AND id NOT IN (SELECT id FROM tsvector_matches)
+              #{additional_sql}
+          ),
+          combined AS (
+            SELECT * FROM tsvector_matches
+            UNION ALL
+            SELECT * FROM trigram_matches
+          )
+          SELECT id, content, access_count, created_at, token_count,
+                 MAX(score) as rank, match_type
+          FROM combined
+          GROUP BY id, content, access_count, created_at, token_count, match_type
+          ORDER BY rank DESC
+          LIMIT ?
+        SQL
+
+        result = ActiveRecord::Base.connection.select_all(
+          ActiveRecord::Base.sanitize_sql_array([
+            sql,
+            TSVECTOR_SCORE_BOOST,          # boost for tsvector
+            query,                         # ts_rank query
+            query,                         # tsvector match query
+            query,                         # trigram similarity query
+            query,                         # trigram match query
+            TRIGRAM_SIMILARITY_THRESHOLD,
+            limit
+          ])
+        )
+
+        # Track access for retrieved nodes
+        node_ids = result.map { |r| r['id'] }
+        track_access(node_ids)
+
+        result.to_a
+      end
+    end
+  end
+end
```
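`search_fulltext` clamps the limit, consults the query cache, and records a per-call latency metric before returning rows that carry the combined `rank`. A usage sketch, assuming the module is mixed into a `LongTermMemory` instance (`ltm` here is illustrative):

```ruby
# Keyword search over the last week of memory. The misspelled word can
# still match via pg_trgm, while stemmed tsvector hits outrank it
# thanks to TSVECTOR_SCORE_BOOST.
results = ltm.search_fulltext(
  timeframe: (Time.now - 7 * 24 * 3600)..Time.now,
  query: 'postgres trigram serch', # typo on purpose
  limit: 20
)

results.each do |row|
  puts "#{row['rank']}  #{row['content'][0, 60]}"
end
```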