htm 0.0.11 → 0.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. checksums.yaml +4 -4
  2. data/.dictate.toml +46 -0
  3. data/.envrc +2 -0
  4. data/CHANGELOG.md +85 -2
  5. data/README.md +348 -79
  6. data/Rakefile +14 -2
  7. data/bin/htm_mcp.rb +94 -0
  8. data/config/database.yml +20 -13
  9. data/db/migrate/00003_create_file_sources.rb +5 -0
  10. data/db/migrate/00004_create_nodes.rb +17 -0
  11. data/db/migrate/00005_create_tags.rb +7 -0
  12. data/db/migrate/00006_create_node_tags.rb +2 -0
  13. data/db/migrate/00007_create_robot_nodes.rb +7 -0
  14. data/db/schema.sql +69 -100
  15. data/docs/api/index.md +1 -1
  16. data/docs/api/yard/HTM/Configuration.md +54 -0
  17. data/docs/api/yard/HTM/Database.md +13 -10
  18. data/docs/api/yard/HTM/EmbeddingService.md +5 -1
  19. data/docs/api/yard/HTM/LongTermMemory.md +18 -277
  20. data/docs/api/yard/HTM/PropositionError.md +18 -0
  21. data/docs/api/yard/HTM/PropositionService.md +66 -0
  22. data/docs/api/yard/HTM/QueryCache.md +88 -0
  23. data/docs/api/yard/HTM/RobotGroup.md +481 -0
  24. data/docs/api/yard/HTM/SqlBuilder.md +108 -0
  25. data/docs/api/yard/HTM/TagService.md +4 -0
  26. data/docs/api/yard/HTM/Telemetry/NullInstrument.md +13 -0
  27. data/docs/api/yard/HTM/Telemetry/NullMeter.md +15 -0
  28. data/docs/api/yard/HTM/Telemetry.md +109 -0
  29. data/docs/api/yard/HTM/WorkingMemoryChannel.md +176 -0
  30. data/docs/api/yard/HTM.md +8 -22
  31. data/docs/api/yard/index.csv +102 -25
  32. data/docs/api/yard-reference.md +8 -0
  33. data/docs/architecture/index.md +1 -1
  34. data/docs/assets/images/multi-provider-failover.svg +51 -0
  35. data/docs/assets/images/robot-group-architecture.svg +65 -0
  36. data/docs/database/README.md +3 -3
  37. data/docs/database/public.file_sources.svg +29 -21
  38. data/docs/database/public.node_tags.md +2 -0
  39. data/docs/database/public.node_tags.svg +53 -41
  40. data/docs/database/public.nodes.md +2 -0
  41. data/docs/database/public.nodes.svg +52 -40
  42. data/docs/database/public.robot_nodes.md +2 -0
  43. data/docs/database/public.robot_nodes.svg +30 -22
  44. data/docs/database/public.robots.svg +16 -12
  45. data/docs/database/public.tags.md +3 -0
  46. data/docs/database/public.tags.svg +41 -33
  47. data/docs/database/schema.json +66 -0
  48. data/docs/database/schema.svg +60 -48
  49. data/docs/development/index.md +14 -1
  50. data/docs/development/rake-tasks.md +1068 -0
  51. data/docs/getting-started/index.md +1 -1
  52. data/docs/getting-started/quick-start.md +144 -155
  53. data/docs/guides/adding-memories.md +2 -3
  54. data/docs/guides/context-assembly.md +185 -184
  55. data/docs/guides/getting-started.md +154 -148
  56. data/docs/guides/index.md +8 -1
  57. data/docs/guides/long-term-memory.md +60 -92
  58. data/docs/guides/mcp-server.md +617 -0
  59. data/docs/guides/multi-robot.md +249 -345
  60. data/docs/guides/recalling-memories.md +153 -163
  61. data/docs/guides/robot-groups.md +604 -0
  62. data/docs/guides/search-strategies.md +61 -58
  63. data/docs/guides/working-memory.md +103 -136
  64. data/docs/images/telemetry-architecture.svg +153 -0
  65. data/docs/index.md +30 -26
  66. data/docs/telemetry.md +391 -0
  67. data/examples/README.md +46 -1
  68. data/examples/cli_app/README.md +1 -1
  69. data/examples/cli_app/htm_cli.rb +1 -1
  70. data/examples/robot_groups/robot_worker.rb +1 -2
  71. data/examples/robot_groups/same_process.rb +1 -4
  72. data/examples/sinatra_app/app.rb +1 -1
  73. data/examples/telemetry/README.md +147 -0
  74. data/examples/telemetry/SETUP_README.md +169 -0
  75. data/examples/telemetry/demo.rb +498 -0
  76. data/examples/telemetry/grafana/dashboards/htm-metrics.json +457 -0
  77. data/lib/htm/configuration.rb +261 -70
  78. data/lib/htm/database.rb +46 -22
  79. data/lib/htm/embedding_service.rb +24 -14
  80. data/lib/htm/errors.rb +15 -1
  81. data/lib/htm/jobs/generate_embedding_job.rb +19 -0
  82. data/lib/htm/jobs/generate_propositions_job.rb +103 -0
  83. data/lib/htm/jobs/generate_tags_job.rb +24 -0
  84. data/lib/htm/loaders/markdown_chunker.rb +79 -0
  85. data/lib/htm/loaders/markdown_loader.rb +41 -15
  86. data/lib/htm/long_term_memory/fulltext_search.rb +138 -0
  87. data/lib/htm/long_term_memory/hybrid_search.rb +324 -0
  88. data/lib/htm/long_term_memory/node_operations.rb +209 -0
  89. data/lib/htm/long_term_memory/relevance_scorer.rb +355 -0
  90. data/lib/htm/long_term_memory/robot_operations.rb +34 -0
  91. data/lib/htm/long_term_memory/tag_operations.rb +428 -0
  92. data/lib/htm/long_term_memory/vector_search.rb +109 -0
  93. data/lib/htm/long_term_memory.rb +51 -1153
  94. data/lib/htm/models/node.rb +35 -2
  95. data/lib/htm/models/node_tag.rb +31 -0
  96. data/lib/htm/models/robot_node.rb +31 -0
  97. data/lib/htm/models/tag.rb +44 -0
  98. data/lib/htm/proposition_service.rb +169 -0
  99. data/lib/htm/query_cache.rb +214 -0
  100. data/lib/htm/robot_group.rb +721 -0
  101. data/lib/htm/sql_builder.rb +178 -0
  102. data/lib/htm/tag_service.rb +16 -6
  103. data/lib/htm/tasks.rb +8 -2
  104. data/lib/htm/telemetry.rb +224 -0
  105. data/lib/htm/version.rb +1 -1
  106. data/lib/htm/working_memory_channel.rb +250 -0
  107. data/lib/htm.rb +66 -3
  108. data/lib/tasks/doc.rake +1 -1
  109. data/lib/tasks/htm.rake +259 -13
  110. data/mkdocs.yml +98 -96
  111. metadata +55 -20
  112. data/.aigcm_msg +0 -1
  113. data/.claude/settings.local.json +0 -95
  114. data/CLAUDE.md +0 -603
  115. data/db/migrate/00009_add_working_memory_to_robot_nodes.rb +0 -12
  116. data/examples/cli_app/temp.log +0 -93
  117. data/examples/robot_groups/lib/robot_group.rb +0 -419
  118. data/examples/robot_groups/lib/working_memory_channel.rb +0 -140
  119. data/lib/htm/loaders/paragraph_chunker.rb +0 -112
  120. data/notes/ARCHITECTURE_REVIEW.md +0 -1167
  121. data/notes/IMPLEMENTATION_SUMMARY.md +0 -606
  122. data/notes/MULTI_FRAMEWORK_IMPLEMENTATION.md +0 -451
  123. data/notes/next_steps.md +0 -100
  124. data/notes/plan.md +0 -627
  125. data/notes/tag_ontology_enhancement_ideas.md +0 -222
  126. data/notes/timescaledb_removal_summary.md +0 -200
data/lib/htm/errors.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # HTM (Hierarchical Temporary Memory) error classes
3
+ # HTM (Hierarchical Temporal Memory) error classes
4
4
  #
5
5
  # All HTM errors inherit from HTM::Error, allowing you to catch
6
6
  # all HTM-related errors with a single rescue clause.
@@ -93,6 +93,20 @@ class HTM
93
93
  #
94
94
  class TagError < Error; end
95
95
 
96
+ # Raised when proposition extraction fails
97
+ #
98
+ # Common causes:
99
+ # - LLM provider API errors
100
+ # - Invalid proposition response format
101
+ # - Network connectivity issues
102
+ # - Model not available
103
+ #
104
+ # Note: This error is distinct from CircuitBreakerOpenError.
105
+ # PropositionError indicates a single failure, while CircuitBreakerOpenError
106
+ # indicates repeated failures have triggered protective circuit breaking.
107
+ #
108
+ class PropositionError < Error; end
109
+
96
110
  # Raised when database operations fail
97
111
  #
98
112
  # Common causes:
@@ -36,6 +36,9 @@ class HTM
36
36
  return
37
37
  end
38
38
 
39
+ provider = HTM.configuration.embedding_provider.to_s
40
+ start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
41
+
39
42
  begin
40
43
  HTM.logger.debug "GenerateEmbeddingJob: Generating embedding for node #{node_id}"
41
44
 
@@ -45,17 +48,33 @@ class HTM
45
48
  # Update node with processed embedding
46
49
  node.update!(embedding: result[:storage_embedding])
47
50
 
51
+ # Record success metrics
52
+ elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
53
+ HTM::Telemetry.embedding_latency.record(elapsed_ms, attributes: { 'provider' => provider, 'status' => 'success' })
54
+ HTM::Telemetry.job_counter.add(1, attributes: { 'job' => 'embedding', 'status' => 'success' })
55
+
48
56
  HTM.logger.info "GenerateEmbeddingJob: Successfully generated embedding for node #{node_id} (#{result[:dimension]} dimensions)"
49
57
 
50
58
  rescue HTM::CircuitBreakerOpenError => e
51
59
  # Circuit breaker is open - service is unavailable, will retry later
60
+ HTM::Telemetry.job_counter.add(1, attributes: { 'job' => 'embedding', 'status' => 'circuit_open' })
52
61
  HTM.logger.warn "GenerateEmbeddingJob: Circuit breaker open for node #{node_id}, will retry when service recovers"
53
62
 
54
63
  rescue HTM::EmbeddingError => e
64
+ # Record failure metrics
65
+ elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
66
+ HTM::Telemetry.embedding_latency.record(elapsed_ms, attributes: { 'provider' => provider, 'status' => 'error' })
67
+ HTM::Telemetry.job_counter.add(1, attributes: { 'job' => 'embedding', 'status' => 'error' })
68
+
55
69
  # Log embedding-specific errors
56
70
  HTM.logger.error "GenerateEmbeddingJob: Embedding generation failed for node #{node_id}: #{e.message}"
57
71
 
58
72
  rescue StandardError => e
73
+ # Record failure metrics
74
+ elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
75
+ HTM::Telemetry.embedding_latency.record(elapsed_ms, attributes: { 'provider' => provider, 'status' => 'error' })
76
+ HTM::Telemetry.job_counter.add(1, attributes: { 'job' => 'embedding', 'status' => 'error' })
77
+
59
78
  # Log unexpected errors
60
79
  HTM.logger.error "GenerateEmbeddingJob: Unexpected error for node #{node_id}: #{e.class.name} - #{e.message}"
61
80
  HTM.logger.debug e.backtrace.first(5).join("\n")
@@ -0,0 +1,103 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../errors'
4
+ require_relative '../models/node'
5
+ require_relative '../proposition_service'
6
+
7
+ class HTM
8
+ module Jobs
9
+ # Background job to extract propositions from nodes and create new nodes
10
+ #
11
+ # This job is enqueued after a node is saved (if proposition extraction is enabled).
12
+ # It uses LLM to extract atomic factual propositions from node content and
13
+ # creates new nodes for each proposition. Proposition nodes are marked with
14
+ # metadata to prevent recursive extraction.
15
+ #
16
+ # @see PropositionService
17
+ #
18
+ class GeneratePropositionsJob
19
+ # Generate propositions for a node
20
+ #
21
+ # Uses the configured proposition extractor (HTM.extract_propositions) which
22
+ # delegates to the application-provided or default RubyLLM implementation.
23
+ #
24
+ # @param node_id [Integer] ID of the node to process
25
+ # @param robot_id [Integer] ID of the robot that owns this node
26
+ #
27
+ def self.perform(node_id:, robot_id:)
28
+ node = HTM::Models::Node.find_by(id: node_id)
29
+
30
+ unless node
31
+ HTM.logger.warn "GeneratePropositionsJob: Node #{node_id} not found"
32
+ return
33
+ end
34
+
35
+ # Skip if this node is already a proposition (prevent recursion)
36
+ if node.metadata&.dig('is_proposition')
37
+ HTM.logger.debug "GeneratePropositionsJob: Node #{node_id} is a proposition, skipping"
38
+ return
39
+ end
40
+
41
+ begin
42
+ HTM.logger.debug "GeneratePropositionsJob: Extracting propositions for node #{node_id}"
43
+
44
+ # Extract propositions using PropositionService
45
+ propositions = HTM::PropositionService.extract(node.content)
46
+
47
+ if propositions.empty?
48
+ HTM.logger.debug "GeneratePropositionsJob: No propositions extracted for node #{node_id}"
49
+ return
50
+ end
51
+
52
+ HTM.logger.info "GeneratePropositionsJob: Extracted #{propositions.length} propositions for node #{node_id}"
53
+
54
+ # Create a node for each proposition
55
+ created_count = 0
56
+ propositions.each do |proposition_text|
57
+ # Calculate token count
58
+ token_count = HTM.count_tokens(proposition_text)
59
+
60
+ # Create proposition node with is_proposition marker
61
+ proposition_node = HTM::Models::Node.create!(
62
+ content: proposition_text,
63
+ token_count: token_count,
64
+ metadata: { is_proposition: true, source_node_id: node_id }
65
+ )
66
+
67
+ # Link to robot via RobotNode
68
+ HTM::Models::RobotNode.find_or_create_by!(
69
+ robot_id: robot_id,
70
+ node_id: proposition_node.id
71
+ )
72
+
73
+ # Enqueue embedding and tag jobs for the new proposition node
74
+ # (but NOT another propositions job - the is_proposition marker prevents that)
75
+ HTM::JobAdapter.enqueue(HTM::Jobs::GenerateEmbeddingJob, node_id: proposition_node.id)
76
+ HTM::JobAdapter.enqueue(HTM::Jobs::GenerateTagsJob, node_id: proposition_node.id)
77
+
78
+ created_count += 1
79
+ end
80
+
81
+ HTM.logger.info "GeneratePropositionsJob: Created #{created_count} proposition nodes from node #{node_id}"
82
+
83
+ rescue HTM::CircuitBreakerOpenError
84
+ # Circuit breaker is open - service is unavailable, will retry later
85
+ HTM.logger.warn "GeneratePropositionsJob: Circuit breaker open for node #{node_id}, will retry when service recovers"
86
+
87
+ rescue HTM::PropositionError => e
88
+ # Log proposition-specific errors
89
+ HTM.logger.error "GeneratePropositionsJob: Proposition extraction failed for node #{node_id}: #{e.message}"
90
+
91
+ rescue ActiveRecord::RecordInvalid => e
92
+ # Log validation errors
93
+ HTM.logger.error "GeneratePropositionsJob: Database validation failed for node #{node_id}: #{e.message}"
94
+
95
+ rescue StandardError => e
96
+ # Log unexpected errors
97
+ HTM.logger.error "GeneratePropositionsJob: Unexpected error for node #{node_id}: #{e.class.name} - #{e.message}"
98
+ HTM.logger.debug e.backtrace.first(5).join("\n")
99
+ end
100
+ end
101
+ end
102
+ end
103
+ end
@@ -33,6 +33,9 @@ class HTM
33
33
  return
34
34
  end
35
35
 
36
+ provider = HTM.configuration.tag_provider.to_s
37
+ start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
38
+
36
39
  begin
37
40
  HTM.logger.debug "GenerateTagsJob: Extracting tags for node #{node_id}"
38
41
 
@@ -61,21 +64,42 @@ class HTM
61
64
  )
62
65
  end
63
66
 
67
+ # Record success metrics
68
+ elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
69
+ HTM::Telemetry.tag_latency.record(elapsed_ms, attributes: { 'provider' => provider, 'status' => 'success' })
70
+ HTM::Telemetry.job_counter.add(1, attributes: { 'job' => 'tags', 'status' => 'success' })
71
+
64
72
  HTM.logger.info "GenerateTagsJob: Successfully generated #{tag_names.length} tags for node #{node_id}: #{tag_names.join(', ')}"
65
73
 
66
74
  rescue HTM::CircuitBreakerOpenError => e
67
75
  # Circuit breaker is open - service is unavailable, will retry later
76
+ HTM::Telemetry.job_counter.add(1, attributes: { 'job' => 'tags', 'status' => 'circuit_open' })
68
77
  HTM.logger.warn "GenerateTagsJob: Circuit breaker open for node #{node_id}, will retry when service recovers"
69
78
 
70
79
  rescue HTM::TagError => e
80
+ # Record failure metrics
81
+ elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
82
+ HTM::Telemetry.tag_latency.record(elapsed_ms, attributes: { 'provider' => provider, 'status' => 'error' })
83
+ HTM::Telemetry.job_counter.add(1, attributes: { 'job' => 'tags', 'status' => 'error' })
84
+
71
85
  # Log tag-specific errors
72
86
  HTM.logger.error "GenerateTagsJob: Tag generation failed for node #{node_id}: #{e.message}"
73
87
 
74
88
  rescue ActiveRecord::RecordInvalid => e
89
+ # Record failure metrics
90
+ elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
91
+ HTM::Telemetry.tag_latency.record(elapsed_ms, attributes: { 'provider' => provider, 'status' => 'error' })
92
+ HTM::Telemetry.job_counter.add(1, attributes: { 'job' => 'tags', 'status' => 'error' })
93
+
75
94
  # Log validation errors
76
95
  HTM.logger.error "GenerateTagsJob: Database validation failed for node #{node_id}: #{e.message}"
77
96
 
78
97
  rescue StandardError => e
98
+ # Record failure metrics
99
+ elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
100
+ HTM::Telemetry.tag_latency.record(elapsed_ms, attributes: { 'provider' => provider, 'status' => 'error' })
101
+ HTM::Telemetry.job_counter.add(1, attributes: { 'job' => 'tags', 'status' => 'error' })
102
+
79
103
  # Log unexpected errors
80
104
  HTM.logger.error "GenerateTagsJob: Unexpected error for node #{node_id}: #{e.class.name} - #{e.message}"
81
105
  HTM.logger.debug e.backtrace.first(5).join("\n")
@@ -0,0 +1,79 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'baran'
4
+
5
+ class HTM
6
+ module Loaders
7
+ # Markdown-aware text chunker using Baran
8
+ #
9
+ # Wraps Baran::MarkdownSplitter to provide intelligent text chunking
10
+ # that respects markdown structure (headers, code blocks, etc.).
11
+ #
12
+ # @example Basic usage
13
+ # chunker = MarkdownChunker.new
14
+ # chunks = chunker.chunk("# Header\n\nParagraph text.\n\n## Subheader\n\nMore text.")
15
+ # # => ["# Header\n\nParagraph text.", "## Subheader\n\nMore text."]
16
+ #
17
+ # @example With custom chunk size
18
+ # chunker = MarkdownChunker.new(chunk_size: 512, chunk_overlap: 50)
19
+ # chunks = chunker.chunk(long_text)
20
+ #
21
+ # @example With full metadata (includes cursor positions)
22
+ # chunker = MarkdownChunker.new
23
+ # chunks = chunker.chunk_with_metadata(text)
24
+ # # => [{ text: "...", cursor: 0, metadata: nil }, { text: "...", cursor: 156, metadata: nil }]
25
+ #
26
+ class MarkdownChunker
27
+ # @param chunk_size [Integer] Maximum characters per chunk (default: from config or 1024)
28
+ # @param chunk_overlap [Integer] Character overlap between chunks (default: from config or 64)
29
+ def initialize(chunk_size: nil, chunk_overlap: nil)
30
+ @chunk_size = chunk_size || HTM.configuration.chunk_size
31
+ @chunk_overlap = chunk_overlap || HTM.configuration.chunk_overlap
32
+
33
+ @splitter = Baran::MarkdownSplitter.new(
34
+ chunk_size: @chunk_size,
35
+ chunk_overlap: @chunk_overlap
36
+ )
37
+ end
38
+
39
+ # Split text into markdown-aware chunks (text only)
40
+ #
41
+ # @param text [String] Text to chunk
42
+ # @return [Array<String>] Array of text chunks
43
+ #
44
+ def chunk(text)
45
+ return [] if text.nil? || text.strip.empty?
46
+
47
+ # Normalize line endings
48
+ normalized = text.gsub(/\r\n?/, "\n")
49
+
50
+ # Use Baran's MarkdownSplitter
51
+ result = @splitter.chunks(normalized)
52
+
53
+ # Extract text from chunk hashes, filter empty
54
+ result.map { |chunk| chunk[:text].strip }.reject(&:empty?)
55
+ end
56
+
57
+ # Split text and return full chunk data (with cursor positions)
58
+ #
59
+ # Returns Baran's full output including:
60
+ # - :text [String] The chunk content
61
+ # - :cursor [Integer] Character offset where chunk starts in original text
62
+ #
63
+ # @param text [String] Text to chunk
64
+ # @return [Array<Hash>] Array of chunk hashes with :text and :cursor
65
+ #
66
+ def chunk_with_metadata(text)
67
+ return [] if text.nil? || text.strip.empty?
68
+
69
+ # Normalize line endings
70
+ normalized = text.gsub(/\r\n?/, "\n")
71
+
72
+ # Use Baran's MarkdownSplitter - returns [{text:, cursor:}, ...]
73
+ @splitter.chunks(normalized)
74
+ end
75
+
76
+ attr_reader :chunk_size, :chunk_overlap
77
+ end
78
+ end
79
+ end
@@ -26,9 +26,14 @@ class HTM
26
26
  MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB maximum file size
27
27
 
28
28
  # @param htm_instance [HTM] The HTM instance to use for storing nodes
29
- def initialize(htm_instance)
29
+ # @param chunk_size [Integer] Maximum characters per chunk (default: from config)
30
+ # @param chunk_overlap [Integer] Character overlap between chunks (default: from config)
31
+ def initialize(htm_instance, chunk_size: nil, chunk_overlap: nil)
30
32
  @htm = htm_instance
31
- @chunker = ParagraphChunker.new
33
+ @chunker = MarkdownChunker.new(
34
+ chunk_size: chunk_size,
35
+ chunk_overlap: chunk_overlap
36
+ )
32
37
  end
33
38
 
34
39
  # Load a single markdown file into long-term memory
@@ -89,19 +94,19 @@ class HTM
89
94
  # Parse frontmatter and body
90
95
  frontmatter, body = extract_frontmatter(content)
91
96
 
92
- # Chunk the body
93
- chunks = @chunker.chunk(body)
97
+ # Chunk the body with metadata (includes cursor positions)
98
+ chunks = @chunker.chunk_with_metadata(body)
94
99
 
95
100
  # Prepend frontmatter to first chunk if present
96
101
  if frontmatter.any? && chunks.any?
97
102
  frontmatter_yaml = YAML.dump(frontmatter).sub(/\A---\n/, "---\n")
98
- chunks[0] = "#{frontmatter_yaml}---\n\n#{chunks[0]}"
103
+ chunks[0][:text] = "#{frontmatter_yaml}---\n\n#{chunks[0][:text]}"
99
104
  end
100
105
 
101
106
  # Save source first (need ID for node association)
102
107
  source.save! if source.new_record?
103
108
 
104
- # Sync chunks to database
109
+ # Sync chunks to database (chunks now include cursor positions)
105
110
  result = sync_chunks(source, chunks)
106
111
 
107
112
  # Update source record
@@ -181,7 +186,7 @@ class HTM
181
186
  # Sync chunks to database, handling updates and deletions
182
187
  #
183
188
  # @param source [FileSource] The source record
184
- # @param chunks [Array<String>] New chunk contents
189
+ # @param chunks [Array<Hash>] New chunks with :text and :cursor keys
185
190
  # @return [Hash] Sync statistics
186
191
  #
187
192
  def sync_chunks(source, chunks)
@@ -197,12 +202,16 @@ class HTM
197
202
  # Track which existing nodes we've matched
198
203
  matched_hashes = Set.new
199
204
 
200
- # Process each new chunk
201
- chunks.each_with_index do |chunk_content, position|
205
+ # Process each new chunk (chunks are now Hashes with :text and :cursor)
206
+ chunks.each_with_index do |chunk_data, position|
207
+ chunk_content = chunk_data[:text].strip
208
+ chunk_cursor = chunk_data[:cursor]
209
+ next if chunk_content.empty?
210
+
202
211
  chunk_hash = HTM::Models::Node.generate_content_hash(chunk_content)
203
212
 
204
213
  if existing_by_hash[chunk_hash]
205
- # Chunk exists - update position if needed, restore if soft-deleted
214
+ # Chunk exists - update position/cursor if needed, restore if soft-deleted
206
215
  node = existing_by_hash[chunk_hash]
207
216
  matched_hashes << chunk_hash
208
217
 
@@ -210,13 +219,20 @@ class HTM
210
219
  changes[:chunk_position] = position if node.chunk_position != position
211
220
  changes[:deleted_at] = nil if node.deleted_at.present?
212
221
 
222
+ # Update cursor in metadata if changed
223
+ current_cursor = node.metadata&.dig('cursor')
224
+ if current_cursor != chunk_cursor
225
+ new_metadata = (node.metadata || {}).merge('cursor' => chunk_cursor)
226
+ changes[:metadata] = new_metadata
227
+ end
228
+
213
229
  if changes.any?
214
230
  node.update!(changes)
215
231
  updated += 1
216
232
  end
217
233
  else
218
- # New chunk - create node
219
- node = create_chunk_node(source, chunk_content, position)
234
+ # New chunk - create node with cursor in metadata
235
+ node = create_chunk_node(source, chunk_content, position, cursor: chunk_cursor)
220
236
  created += 1 if node
221
237
  end
222
238
  end
@@ -238,11 +254,15 @@ class HTM
238
254
  # @param source [FileSource] The source record
239
255
  # @param content [String] Chunk content
240
256
  # @param position [Integer] Position in file (0-indexed)
257
+ # @param cursor [Integer] Character offset in original file
241
258
  # @return [Node, nil] The created node or nil if duplicate
242
259
  #
243
- def create_chunk_node(source, content, position)
260
+ def create_chunk_node(source, content, position, cursor: nil)
261
+ # Build metadata with cursor position (file path is in source, not duplicated here)
262
+ chunk_metadata = cursor ? { 'cursor' => cursor } : {}
263
+
244
264
  # Use remember to get proper embedding/tag processing
245
- node_id = @htm.remember(content)
265
+ node_id = @htm.remember(content, metadata: chunk_metadata)
246
266
 
247
267
  # Update with source reference
248
268
  node = HTM::Models::Node.find(node_id)
@@ -254,7 +274,13 @@ class HTM
254
274
  # Find and link to this source
255
275
  existing = HTM::Models::Node.find_by_content(content)
256
276
  if existing && existing.source_id.nil?
257
- existing.update!(source_id: source.id, chunk_position: position)
277
+ # Merge cursor into existing metadata
278
+ new_metadata = (existing.metadata || {}).merge('cursor' => cursor) if cursor
279
+ existing.update!(
280
+ source_id: source.id,
281
+ chunk_position: position,
282
+ metadata: new_metadata || existing.metadata
283
+ )
258
284
  end
259
285
  existing
260
286
  end
@@ -0,0 +1,138 @@
1
+ # frozen_string_literal: true
2
+
3
+ class HTM
4
+ class LongTermMemory
5
+ # Full-text search using PostgreSQL tsvector and pg_trgm
6
+ #
7
+ # Performs keyword-based search using:
8
+ # - PostgreSQL full-text search (tsvector/tsquery) for stemmed word matching
9
+ # - Trigram fuzzy matching (pg_trgm) for typos and partial words
10
+ # - Combined scoring: tsvector matches rank higher, trigram provides fallback
11
+ #
12
+ # Results are cached for performance.
13
+ #
14
+ # Security: All queries use parameterized placeholders to prevent SQL injection.
15
+ #
16
+ module FulltextSearch
17
+ # Maximum results to prevent DoS via unbounded queries
18
+ MAX_FULLTEXT_LIMIT = 1000
19
+
20
+ # Minimum trigram similarity threshold (0.0-1.0)
21
+ # Lower = more fuzzy matches, higher = stricter matching
22
+ TRIGRAM_SIMILARITY_THRESHOLD = 0.1
23
+
24
+ # Score boost for tsvector matches over trigram matches
25
+ # Ensures exact word matches rank above fuzzy matches
26
+ TSVECTOR_SCORE_BOOST = 1.0
27
+
28
+ # Full-text search
29
+ #
30
+ # @param timeframe [Range] Time range to search
31
+ # @param query [String] Search query
32
+ # @param limit [Integer] Maximum results (capped at MAX_FULLTEXT_LIMIT)
33
+ # @param metadata [Hash] Filter by metadata fields (default: {})
34
+ # @return [Array<Hash>] Matching nodes
35
+ #
36
+ def search_fulltext(timeframe:, query:, limit:, metadata: {})
37
+ # Enforce limit to prevent DoS
38
+ safe_limit = [[limit.to_i, 1].max, MAX_FULLTEXT_LIMIT].min
39
+
40
+ start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
41
+ result = @cache.fetch(:fulltext, timeframe, query, safe_limit, metadata) do
42
+ search_fulltext_uncached(
43
+ timeframe: timeframe,
44
+ query: query,
45
+ limit: safe_limit,
46
+ metadata: metadata
47
+ )
48
+ end
49
+ elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
50
+ HTM::Telemetry.search_latency.record(elapsed_ms, attributes: { 'strategy' => 'fulltext' })
51
+ result
52
+ end
53
+
54
+ private
55
+
56
+ # Uncached full-text search combining tsvector and trigram matching
57
+ #
58
+ # Uses UNION to combine:
59
+ # 1. tsvector matches (stemmed words, high priority)
60
+ # 2. trigram matches (fuzzy/partial, lower priority fallback)
61
+ #
62
+ # Deduplicates by taking highest score per node.
63
+ #
64
+ # @param timeframe [nil, Range, Array<Range>] Time range(s) to search (nil = no filter)
65
+ # @param query [String] Search query
66
+ # @param limit [Integer] Maximum results
67
+ # @param metadata [Hash] Filter by metadata fields (default: {})
68
+ # @return [Array<Hash>] Matching nodes
69
+ #
70
+ def search_fulltext_uncached(timeframe:, query:, limit:, metadata: {})
71
+ # Build filter conditions
72
+ timeframe_condition = HTM::SqlBuilder.timeframe_condition(timeframe)
73
+ metadata_condition = HTM::SqlBuilder.metadata_condition(metadata)
74
+
75
+ additional_conditions = []
76
+ additional_conditions << timeframe_condition if timeframe_condition
77
+ additional_conditions << metadata_condition if metadata_condition
78
+ additional_sql = additional_conditions.any? ? "AND #{additional_conditions.join(' AND ')}" : ""
79
+
80
+ # Combined tsvector + trigram search
81
+ # tsvector matches get boosted score, trigram provides fuzzy fallback
82
+ sql = <<~SQL
83
+ WITH tsvector_matches AS (
84
+ -- Primary: tsvector full-text search (stemmed word matching)
85
+ SELECT id, content, access_count, created_at, token_count,
86
+ (? + ts_rank(to_tsvector('english', content), plainto_tsquery('english', ?))) as score,
87
+ 'tsvector' as match_type
88
+ FROM nodes
89
+ WHERE deleted_at IS NULL
90
+ AND to_tsvector('english', content) @@ plainto_tsquery('english', ?)
91
+ #{additional_sql}
92
+ ),
93
+ trigram_matches AS (
94
+ -- Fallback: trigram fuzzy matching (typos, partial words)
95
+ SELECT id, content, access_count, created_at, token_count,
96
+ similarity(content, ?) as score,
97
+ 'trigram' as match_type
98
+ FROM nodes
99
+ WHERE deleted_at IS NULL
100
+ AND similarity(content, ?) >= ?
101
+ AND id NOT IN (SELECT id FROM tsvector_matches)
102
+ #{additional_sql}
103
+ ),
104
+ combined AS (
105
+ SELECT * FROM tsvector_matches
106
+ UNION ALL
107
+ SELECT * FROM trigram_matches
108
+ )
109
+ SELECT id, content, access_count, created_at, token_count,
110
+ MAX(score) as rank, match_type
111
+ FROM combined
112
+ GROUP BY id, content, access_count, created_at, token_count, match_type
113
+ ORDER BY rank DESC
114
+ LIMIT ?
115
+ SQL
116
+
117
+ result = ActiveRecord::Base.connection.select_all(
118
+ ActiveRecord::Base.sanitize_sql_array([
119
+ sql,
120
+ TSVECTOR_SCORE_BOOST, # boost for tsvector
121
+ query, # ts_rank query
122
+ query, # tsvector match query
123
+ query, # trigram similarity query
124
+ query, # trigram match query
125
+ TRIGRAM_SIMILARITY_THRESHOLD,
126
+ limit
127
+ ])
128
+ )
129
+
130
+ # Track access for retrieved nodes
131
+ node_ids = result.map { |r| r['id'] }
132
+ track_access(node_ids)
133
+
134
+ result.to_a
135
+ end
136
+ end
137
+ end
138
+ end