htm 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. checksums.yaml +7 -0
  2. data/.architecture/decisions/adrs/001-use-postgresql-timescaledb-storage.md +227 -0
  3. data/.architecture/decisions/adrs/002-two-tier-memory-architecture.md +322 -0
  4. data/.architecture/decisions/adrs/003-ollama-default-embedding-provider.md +339 -0
  5. data/.architecture/decisions/adrs/004-multi-robot-shared-memory-hive-mind.md +374 -0
  6. data/.architecture/decisions/adrs/005-rag-based-retrieval-with-hybrid-search.md +443 -0
  7. data/.architecture/decisions/adrs/006-context-assembly-strategies.md +444 -0
  8. data/.architecture/decisions/adrs/007-working-memory-eviction-strategy.md +461 -0
  9. data/.architecture/decisions/adrs/008-robot-identification-system.md +550 -0
  10. data/.architecture/decisions/adrs/009-never-forget-explicit-deletion-only.md +570 -0
  11. data/.architecture/decisions/adrs/010-redis-working-memory-rejected.md +323 -0
  12. data/.architecture/decisions/adrs/011-database-side-embedding-generation-with-pgai.md +585 -0
  13. data/.architecture/decisions/adrs/012-llm-driven-ontology-topic-extraction.md +583 -0
  14. data/.architecture/decisions/adrs/013-activerecord-orm-and-many-to-many-tagging.md +299 -0
  15. data/.architecture/decisions/adrs/014-client-side-embedding-generation-workflow.md +569 -0
  16. data/.architecture/decisions/adrs/015-hierarchical-tag-ontology-and-llm-extraction.md +701 -0
  17. data/.architecture/decisions/adrs/016-async-embedding-and-tag-generation.md +694 -0
  18. data/.architecture/members.yml +144 -0
  19. data/.architecture/reviews/2025-10-29-llm-configuration-and-async-processing-review.md +1137 -0
  20. data/.architecture/reviews/initial-system-analysis.md +330 -0
  21. data/.envrc +32 -0
  22. data/.irbrc +145 -0
  23. data/CHANGELOG.md +150 -0
  24. data/COMMITS.md +196 -0
  25. data/LICENSE +21 -0
  26. data/README.md +1347 -0
  27. data/Rakefile +51 -0
  28. data/SETUP.md +268 -0
  29. data/config/database.yml +67 -0
  30. data/db/migrate/20250101000001_enable_extensions.rb +14 -0
  31. data/db/migrate/20250101000002_create_robots.rb +14 -0
  32. data/db/migrate/20250101000003_create_nodes.rb +42 -0
  33. data/db/migrate/20250101000005_create_tags.rb +38 -0
  34. data/db/migrate/20250101000007_add_node_vector_indexes.rb +30 -0
  35. data/db/schema.sql +473 -0
  36. data/db/seed_data/README.md +100 -0
  37. data/db/seed_data/presidents.md +136 -0
  38. data/db/seed_data/states.md +151 -0
  39. data/db/seeds.rb +208 -0
  40. data/dbdoc/README.md +173 -0
  41. data/dbdoc/public.node_stats.md +48 -0
  42. data/dbdoc/public.node_stats.svg +41 -0
  43. data/dbdoc/public.node_tags.md +40 -0
  44. data/dbdoc/public.node_tags.svg +112 -0
  45. data/dbdoc/public.nodes.md +54 -0
  46. data/dbdoc/public.nodes.svg +118 -0
  47. data/dbdoc/public.nodes_tags.md +39 -0
  48. data/dbdoc/public.nodes_tags.svg +112 -0
  49. data/dbdoc/public.ontology_structure.md +48 -0
  50. data/dbdoc/public.ontology_structure.svg +38 -0
  51. data/dbdoc/public.operations_log.md +42 -0
  52. data/dbdoc/public.operations_log.svg +130 -0
  53. data/dbdoc/public.relationships.md +39 -0
  54. data/dbdoc/public.relationships.svg +41 -0
  55. data/dbdoc/public.robot_activity.md +46 -0
  56. data/dbdoc/public.robot_activity.svg +35 -0
  57. data/dbdoc/public.robots.md +35 -0
  58. data/dbdoc/public.robots.svg +90 -0
  59. data/dbdoc/public.schema_migrations.md +29 -0
  60. data/dbdoc/public.schema_migrations.svg +26 -0
  61. data/dbdoc/public.tags.md +35 -0
  62. data/dbdoc/public.tags.svg +60 -0
  63. data/dbdoc/public.topic_relationships.md +45 -0
  64. data/dbdoc/public.topic_relationships.svg +32 -0
  65. data/dbdoc/schema.json +1437 -0
  66. data/dbdoc/schema.svg +154 -0
  67. data/docs/api/database.md +806 -0
  68. data/docs/api/embedding-service.md +532 -0
  69. data/docs/api/htm.md +797 -0
  70. data/docs/api/index.md +259 -0
  71. data/docs/api/long-term-memory.md +1096 -0
  72. data/docs/api/working-memory.md +665 -0
  73. data/docs/architecture/adrs/001-postgresql-timescaledb.md +314 -0
  74. data/docs/architecture/adrs/002-two-tier-memory.md +411 -0
  75. data/docs/architecture/adrs/003-ollama-embeddings.md +421 -0
  76. data/docs/architecture/adrs/004-hive-mind.md +437 -0
  77. data/docs/architecture/adrs/005-rag-retrieval.md +531 -0
  78. data/docs/architecture/adrs/006-context-assembly.md +496 -0
  79. data/docs/architecture/adrs/007-eviction-strategy.md +645 -0
  80. data/docs/architecture/adrs/008-robot-identification.md +625 -0
  81. data/docs/architecture/adrs/009-never-forget.md +648 -0
  82. data/docs/architecture/adrs/010-redis-working-memory-rejected.md +323 -0
  83. data/docs/architecture/adrs/011-pgai-integration.md +494 -0
  84. data/docs/architecture/adrs/index.md +215 -0
  85. data/docs/architecture/hive-mind.md +736 -0
  86. data/docs/architecture/index.md +351 -0
  87. data/docs/architecture/overview.md +538 -0
  88. data/docs/architecture/two-tier-memory.md +873 -0
  89. data/docs/assets/css/custom.css +83 -0
  90. data/docs/assets/images/htm-core-components.svg +63 -0
  91. data/docs/assets/images/htm-database-schema.svg +93 -0
  92. data/docs/assets/images/htm-hive-mind-architecture.svg +125 -0
  93. data/docs/assets/images/htm-importance-scoring-framework.svg +83 -0
  94. data/docs/assets/images/htm-layered-architecture.svg +71 -0
  95. data/docs/assets/images/htm-long-term-memory-architecture.svg +115 -0
  96. data/docs/assets/images/htm-working-memory-architecture.svg +120 -0
  97. data/docs/assets/images/htm.jpg +0 -0
  98. data/docs/assets/images/htm_demo.gif +0 -0
  99. data/docs/assets/js/mathjax.js +18 -0
  100. data/docs/assets/videos/htm_video.mp4 +0 -0
  101. data/docs/database_rake_tasks.md +322 -0
  102. data/docs/development/contributing.md +787 -0
  103. data/docs/development/index.md +336 -0
  104. data/docs/development/schema.md +596 -0
  105. data/docs/development/setup.md +719 -0
  106. data/docs/development/testing.md +819 -0
  107. data/docs/guides/adding-memories.md +824 -0
  108. data/docs/guides/context-assembly.md +1009 -0
  109. data/docs/guides/getting-started.md +577 -0
  110. data/docs/guides/index.md +118 -0
  111. data/docs/guides/long-term-memory.md +941 -0
  112. data/docs/guides/multi-robot.md +866 -0
  113. data/docs/guides/recalling-memories.md +927 -0
  114. data/docs/guides/search-strategies.md +953 -0
  115. data/docs/guides/working-memory.md +717 -0
  116. data/docs/index.md +214 -0
  117. data/docs/installation.md +477 -0
  118. data/docs/multi_framework_support.md +519 -0
  119. data/docs/quick-start.md +655 -0
  120. data/docs/setup_local_database.md +302 -0
  121. data/docs/using_rake_tasks_in_your_app.md +383 -0
  122. data/examples/basic_usage.rb +93 -0
  123. data/examples/cli_app/README.md +317 -0
  124. data/examples/cli_app/htm_cli.rb +270 -0
  125. data/examples/custom_llm_configuration.rb +183 -0
  126. data/examples/example_app/Rakefile +71 -0
  127. data/examples/example_app/app.rb +206 -0
  128. data/examples/sinatra_app/Gemfile +21 -0
  129. data/examples/sinatra_app/app.rb +335 -0
  130. data/lib/htm/active_record_config.rb +113 -0
  131. data/lib/htm/configuration.rb +342 -0
  132. data/lib/htm/database.rb +594 -0
  133. data/lib/htm/embedding_service.rb +115 -0
  134. data/lib/htm/errors.rb +34 -0
  135. data/lib/htm/job_adapter.rb +154 -0
  136. data/lib/htm/jobs/generate_embedding_job.rb +65 -0
  137. data/lib/htm/jobs/generate_tags_job.rb +82 -0
  138. data/lib/htm/long_term_memory.rb +965 -0
  139. data/lib/htm/models/node.rb +109 -0
  140. data/lib/htm/models/node_tag.rb +33 -0
  141. data/lib/htm/models/robot.rb +52 -0
  142. data/lib/htm/models/tag.rb +76 -0
  143. data/lib/htm/railtie.rb +76 -0
  144. data/lib/htm/sinatra.rb +157 -0
  145. data/lib/htm/tag_service.rb +135 -0
  146. data/lib/htm/tasks.rb +38 -0
  147. data/lib/htm/version.rb +5 -0
  148. data/lib/htm/working_memory.rb +182 -0
  149. data/lib/htm.rb +400 -0
  150. data/lib/tasks/db.rake +19 -0
  151. data/lib/tasks/htm.rake +147 -0
  152. data/lib/tasks/jobs.rake +312 -0
  153. data/mkdocs.yml +190 -0
  154. data/scripts/install_local_database.sh +309 -0
  155. metadata +341 -0
@@ -0,0 +1,594 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pg'
4
+ require 'uri'
5
+ require 'set'
6
+
7
+ class HTM
8
+ # Database setup and configuration for HTM
9
+ # Handles schema creation and database initialization
10
+ class Database
11
+ class << self
12
+ # Set up the HTM database schema
13
+ #
14
+ # @param db_url [String] Database connection URL (uses ENV['HTM_DBURL'] if not provided)
15
+ # @param run_migrations [Boolean] Whether to run migrations (default: true)
16
+ # @param dump_schema [Boolean] Whether to dump schema to db/schema.sql after setup (default: false)
17
+ # @return [void]
18
+ #
19
def setup(db_url = nil, run_migrations: true, dump_schema: false)
  # ActiveRecord is loaded lazily so that requiring this file alone
  # does not pull it in.
  require 'active_record'
  require_relative 'active_record_config'

  # NOTE(review): db_url is not forwarded to establish_connection!; it is
  # only used for the optional schema dump below. Confirm that is intended.
  HTM::ActiveRecordConfig.establish_connection!

  if run_migrations
    puts "Running ActiveRecord migrations..."
    run_activerecord_migrations
  end

  puts "✓ HTM database schema created successfully"

  # The keyword argument shadows the method of the same name, hence `self.`.
  return unless dump_schema

  puts ""
  self.dump_schema(db_url)
end
40
+
41
+ # Run pending database migrations
42
+ #
43
+ # @param db_url [String] Database connection URL (uses ENV['HTM_DBURL'] if not provided)
44
+ # @return [void]
45
+ #
46
def migrate(db_url = nil)
  # Lazy-load the ActiveRecord layer; it is only needed for migration work.
  require 'active_record'
  require_relative 'active_record_config'

  # NOTE(review): db_url is accepted but not used here; the connection comes
  # from whatever HTM::ActiveRecordConfig.establish_connection! resolves.
  HTM::ActiveRecordConfig.establish_connection!

  # Apply every migration file that has not been recorded yet.
  run_activerecord_migrations

  puts "✓ Database migrations completed"
end
57
+
58
+ # Show migration status
59
+ #
60
+ # @param db_url [String] Database connection URL (uses ENV['HTM_DBURL'] if not provided)
61
+ # @return [void]
62
+ #
63
def migration_status(db_url = nil)
  # Prints one line per migration file (✓ applied / ✗ pending) followed by a
  # summary. Counts are computed against the migration files on disk, so
  # stale rows in schema_migrations (versions with no matching file) can no
  # longer inflate the "applied" figure or drive "pending" negative.
  #
  # NOTE(review): db_url is accepted but not used; the connection comes from
  # HTM::ActiveRecordConfig.establish_connection!.
  require 'active_record'
  require_relative 'active_record_config'

  HTM::ActiveRecordConfig.establish_connection!

  migrations_path = File.expand_path('../../db/migrate', __dir__)

  # Migration files on disk, ordered by their version prefix.
  available_migrations = Dir.glob(File.join(migrations_path, '*.rb')).map do |file|
    {
      version: File.basename(file).split('_').first,
      name: File.basename(file, '.rb')
    }
  end.sort_by { |m| m[:version] }

  # Versions recorded in the database; an unreadable/missing table is
  # treated as "nothing applied". A Set gives O(1) membership checks.
  applied_versions = begin
    ActiveRecord::Base.connection
                      .select_values('SELECT version FROM schema_migrations ORDER BY version')
                      .map(&:to_s)
                      .to_set
  rescue ActiveRecord::StatementInvalid
    Set.new
  end

  puts "\nMigration Status"
  puts "=" * 100

  if available_migrations.empty?
    puts "No migration files found in db/migrate/"
  else
    available_migrations.each do |migration|
      status_mark = applied_versions.include?(migration[:version]) ? "✓" : "✗"
      puts "#{status_mark} #{migration[:name]}"
    end
  end

  applied_count = available_migrations.count { |m| applied_versions.include?(m[:version]) }
  pending_count = available_migrations.length - applied_count

  puts "\nSummary: #{applied_count} applied, #{pending_count} pending"
  puts "=" * 100
end
107
+
108
+ # Drop all HTM tables
109
+ #
110
+ # @param db_url [String] Database connection URL (uses ENV['HTM_DBURL'] if not provided)
111
+ # @return [void]
112
+ #
113
def drop(db_url = nil)
  # Drops every HTM table, function and view from the target database.
  #
  # Each DROP is attempted independently (best effort); failures are
  # reported per object and counted, and the final line only claims success
  # when nothing failed. The connection is always closed, even if an
  # unexpected (non-PG) error aborts the run.
  #
  # Raises RuntimeError when no connection URL is available.
  config = parse_connection_url(db_url || ENV['HTM_DBURL'])
  raise "Database configuration not found" unless config

  conn = PG.connect(config)
  failures = 0

  puts "Dropping HTM tables..."
  %w[nodes node_tags tags robots operations_log schema_migrations].each do |table|
    begin
      conn.exec("DROP TABLE IF EXISTS #{table} CASCADE")
      puts " ✓ Dropped #{table}"
    rescue PG::Error => e
      failures += 1
      puts " ✗ Error dropping #{table}: #{e.message}"
    end
  end

  # Drop functions and triggers
  begin
    conn.exec("DROP FUNCTION IF EXISTS extract_ontology_topics() CASCADE")
    puts " ✓ Dropped ontology functions and triggers"
  rescue PG::Error => e
    failures += 1
    puts " ✗ Error dropping functions: #{e.message}"
  end

  # Drop views
  begin
    conn.exec("DROP VIEW IF EXISTS ontology_structure CASCADE")
    conn.exec("DROP VIEW IF EXISTS topic_relationships CASCADE")
    puts " ✓ Dropped ontology views"
  rescue PG::Error => e
    failures += 1
    puts " ✗ Error dropping views: #{e.message}"
  end

  if failures.zero?
    puts "✓ All HTM tables dropped"
  else
    puts "⚠ Finished with #{failures} error(s); some HTM objects may remain"
  end
ensure
  # conn is nil when configuration parsing or PG.connect itself failed.
  conn&.close
end
151
+
152
+ # Seed database with sample data
153
+ #
154
+ # Loads and executes db/seeds.rb file following Rails conventions.
155
+ # All seeding logic is contained in db/seeds.rb and reads data
156
+ # from markdown files in db/seed_data/ directory.
157
+ #
158
+ # @param db_url [String] Database connection URL (uses ENV['HTM_DBURL'] if not provided)
159
+ # @return [void]
160
+ #
161
def seed(db_url = nil)
  # Executes db/seeds.rb (resolved relative to this file). All seeding logic
  # lives in that script; this method only locates and loads it.
  # NOTE(review): db_url is accepted but not used by this method.
  seeds_file = File.expand_path('../../db/seeds.rb', __dir__)

  if File.exist?(seeds_file)
    load seeds_file
  else
    puts "✗ Error: Seeds file not found at #{seeds_file}"
    puts " Please create db/seeds.rb with your seeding logic"
    exit 1
  end
end
173
+
174
+ # Dump current database schema to db/schema.sql
175
+ #
176
+ # Uses pg_dump to create a clean SQL schema file without data
177
+ #
178
+ # @param db_url [String] Database connection URL (uses ENV['HTM_DBURL'] if not provided)
179
+ # @return [void]
180
+ #
181
def dump_schema(db_url = nil)
  # Writes a data-free SQL schema to db/schema.sql by shelling out to
  # pg_dump, then post-processing the output with clean_schema_dump.
  # Exits the process with status 1 when pg_dump fails.
  config = parse_connection_url(db_url || ENV['HTM_DBURL'])
  raise "Database configuration not found" unless config

  schema_file = File.expand_path('../../db/schema.sql', __dir__)
  puts "Dumping schema to #{schema_file}..."

  # Schema only; no ownership, privileges or tablespaces; TimescaleDB
  # internals and the system catalogs are excluded.
  dump_flags = %w[
    --schema-only
    --no-owner
    --no-privileges
    --no-tablespaces
    --exclude-schema=_timescaledb_*
    --exclude-schema=information_schema
    --exclude-schema=pg_catalog
  ]
  connection_args = [
    '-h', config[:host],
    '-p', config[:port].to_s,
    '-U', config[:user],
    '-d', config[:dbname]
  ]

  # The password travels via the environment rather than the command line.
  require 'open3'
  stdout, stderr, status = Open3.capture3(
    { 'PGPASSWORD' => config[:password] },
    'pg_dump', *dump_flags, *connection_args
  )

  unless status.success?
    puts "✗ Error dumping schema:"
    puts stderr
    exit 1
  end

  File.write(schema_file, clean_schema_dump(stdout))

  puts "✓ Schema dumped successfully to #{schema_file}"
  puts " Size: #{File.size(schema_file)} bytes"
end
233
+
234
+ # Load schema from db/schema.sql
235
+ #
236
+ # Uses psql to load the schema file
237
+ #
238
+ # @param db_url [String] Database connection URL (uses ENV['HTM_DBURL'] if not provided)
239
+ # @return [void]
240
+ #
241
def load_schema(db_url = nil)
  # Loads db/schema.sql into the target database by shelling out to psql.
  #
  # psql exits with status 0 even when individual statements in the file
  # fail (ON_ERROR_STOP is not set, so it carries on past errors). The exit
  # status alone therefore cannot prove the load was clean: anything psql
  # wrote to stderr is shown, and a clean success is only reported when no
  # ERROR lines were emitted.
  #
  # Exits the process with status 1 when the schema file is missing or psql
  # itself fails. Raises RuntimeError when no connection URL is available.
  config = parse_connection_url(db_url || ENV['HTM_DBURL'])
  raise "Database configuration not found" unless config

  schema_file = File.expand_path('../../db/schema.sql', __dir__)

  unless File.exist?(schema_file)
    puts "✗ Schema file not found: #{schema_file}"
    puts " Run 'rake htm:db:schema:dump' first to create it"
    exit 1
  end

  puts "Loading schema from #{schema_file}..."

  # The password travels via the environment rather than the command line.
  env = {
    'PGPASSWORD' => config[:password]
  }

  cmd = [
    'psql',
    '-h', config[:host],
    '-p', config[:port].to_s,
    '-U', config[:user],
    '-d', config[:dbname],
    '-f', schema_file,
    '--quiet'
  ]

  require 'open3'
  _stdout, stderr, status = Open3.capture3(env, *cmd)

  unless status.success?
    puts "✗ Error loading schema:"
    puts stderr
    exit 1
  end

  # Surface notices and per-statement errors that psql did not treat as fatal.
  unless stderr.strip.empty?
    puts "⚠ psql reported the following while loading the schema:"
    puts stderr
  end

  if stderr.match?(/\bERROR:/)
    puts "⚠ Schema loaded with errors (psql continued past failed statements)"
  else
    puts "✓ Schema loaded successfully"
  end
end
282
+
283
+ # Generate database documentation using tbls
284
+ #
285
+ # Creates comprehensive database documentation in dbdoc/ directory including:
286
+ # - Entity-relationship diagrams
287
+ # - Table schemas with comments
288
+ # - Index information
289
+ # - Relationship diagrams
290
+ #
291
+ # @param db_url [String] Database connection URL (uses ENV['HTM_DBURL'] if not provided)
292
+ # @return [void]
293
+ #
294
def generate_docs(db_url = nil)
  # Runs `tbls doc --force` against the database and writes the generated
  # documentation into the dbdoc/ directory next to lib/. Exits the process
  # with status 1 when tbls is not installed or the tbls run fails.
  config = parse_connection_url(db_url || ENV['HTM_DBURL'])
  raise "Database configuration not found" unless config

  dbdoc_dir = File.expand_path('../../dbdoc', __dir__)
  puts "Generating database documentation in #{dbdoc_dir}..."
  Dir.mkdir(dbdoc_dir) unless Dir.exist?(dbdoc_dir)

  # Connection string handed to tbls; the password segment is included only
  # when a password is configured.
  credentials = config[:password] ? "#{config[:user]}:#{config[:password]}" : config[:user].to_s
  pg_url = "postgresql://#{credentials}@#{config[:host]}:#{config[:port]}/#{config[:dbname]}" \
           "?sslmode=#{config[:sslmode] || 'prefer'}"

  tbls_available = system('which tbls > /dev/null 2>&1')
  unless tbls_available
    puts [
      "✗ Error: 'tbls' is not installed",
      "",
      "Install tbls:",
      " brew install k1LoW/tap/tbls",
      " # or",
      " go install github.com/k1LoW/tbls@latest",
      "",
      "See: https://github.com/k1LoW/tbls"
    ]
    exit 1
  end

  # --force lets tbls overwrite documentation from a previous run.
  require 'open3'
  stdout, stderr, status = Open3.capture3('tbls', 'doc', '--force', pg_url, dbdoc_dir)

  unless status.success?
    puts "✗ Error generating documentation:"
    puts stderr
    puts stdout
    exit 1
  end

  puts stdout if stdout && !stdout.empty?
  puts "✓ Database documentation generated successfully"
  puts ""
  puts "Documentation files:"
  puts " #{dbdoc_dir}/README.md - Main documentation"
  puts " #{dbdoc_dir}/schema.svg - ER diagram (if generated)"
  puts " #{dbdoc_dir}/*.md - Individual table documentation"
  puts ""
  puts "View documentation:"
  puts " open #{dbdoc_dir}/README.md"
end
349
+
350
+ # Show database info
351
+ #
352
+ # @param db_url [String] Database connection URL (uses ENV['HTM_DBURL'] if not provided)
353
+ # @return [void]
354
+ #
355
def info(db_url = nil)
  # Prints a human-readable report about the target database: connection
  # parameters, PostgreSQL version, installed extensions, row counts of the
  # HTM tables and the total database size.
  #
  # The connection is closed in an `ensure` so that a failing query (for
  # example a permissions error) no longer leaks it.
  #
  # Raises RuntimeError when no connection URL is available.
  config = parse_connection_url(db_url || ENV['HTM_DBURL'])
  raise "Database configuration not found" unless config

  conn = PG.connect(config)

  begin
    puts "\nHTM Database Information"
    puts "=" * 80

    # Connection info
    puts "\nConnection:"
    puts " Host: #{config[:host]}"
    puts " Port: #{config[:port]}"
    puts " Database: #{config[:dbname]}"
    puts " User: #{config[:user]}"

    # PostgreSQL version (only the part before the first comma is shown)
    version = conn.exec("SELECT version()").first['version']
    puts "\nPostgreSQL Version:"
    puts " #{version.split(',').first}"

    # Extensions
    puts "\nExtensions:"
    conn.exec("SELECT extname, extversion FROM pg_extension ORDER BY extname").to_a.each do |ext|
      puts " #{ext['extname']} (#{ext['extversion']})"
    end

    # Table info; a missing table is reported rather than treated as an error.
    puts "\nHTM Tables:"
    %w[nodes tags robots operations_log schema_migrations].each do |table|
      begin
        count = conn.exec("SELECT COUNT(*) FROM #{table}").first['count']
        puts " #{table}: #{count} rows"
      rescue PG::UndefinedTable
        puts " #{table}: not created"
      end
    end

    # Database size
    db_size = conn.exec(
      "SELECT pg_size_pretty(pg_database_size($1)) AS size",
      [config[:dbname]]
    ).first['size']
    puts "\nDatabase Size: #{db_size}"
  ensure
    conn.close
  end

  puts "=" * 80
end
405
+
406
+ # Parse database connection URL
407
+ #
408
+ # @param url [String] Connection URL
409
+ # @return [Hash, nil] Connection configuration hash
410
+ #
411
def parse_connection_url(url)
  # Converts a connection URL such as
  #   postgresql://user:pass@host:5432/dbname?sslmode=require
  # into the configuration hash used throughout this class.
  #
  # Returns nil for a nil or blank URL. Previously an empty string (e.g.
  # HTM_DBURL set to "") produced a hash of nils, which slipped past the
  # callers' "configuration not found" guard.
  #
  # Returned keys: :host, :port, :dbname, :user, :password, :sslmode
  # (:sslmode defaults to 'prefer' when the URL has no sslmode parameter).
  #
  # NOTE(review): URI returns user/password still percent-encoded; a
  # password containing reserved characters is passed on in encoded form.
  return nil if url.nil? || url.to_s.strip.empty?

  uri = URI.parse(url)
  params = URI.decode_www_form(uri.query || '').to_h

  {
    host: uri.host,
    port: uri.port,
    dbname: uri.path[1..], # strip the leading "/"
    user: uri.user,
    password: uri.password,
    sslmode: params['sslmode'] || 'prefer'
  }
end
426
+
427
+ # Build config from individual environment variables
428
+ #
429
+ # @return [Hash, nil] Connection configuration hash
430
+ #
431
def parse_connection_params
  # Assembles a connection hash from the individual HTM_DB* environment
  # variables. Returns nil when HTM_DBNAME is not set.
  #
  # NOTE(review): the fallback host and port point at one specific hosted
  # database instance. Anyone who sets HTM_DBNAME without HTM_DBHOST will
  # have their credentials sent to that host — confirm this default is
  # intentional before relying on it.
  dbname = ENV['HTM_DBNAME']
  return nil unless dbname

  {
    host: ENV.fetch('HTM_DBHOST', 'cw7rxj91bm.srbbwwxn56.tsdb.cloud.timescale.com'),
    port: ENV.fetch('HTM_DBPORT', 37807).to_i,
    dbname: dbname,
    user: ENV['HTM_DBUSER'],
    password: ENV['HTM_DBPASS'],
    sslmode: 'require'
  }
end
443
+
444
+ # Get default database configuration
445
+ #
446
+ # @return [Hash, nil] Connection configuration hash
447
+ #
448
def default_config
  # Resolution order: a full HTM_DBURL wins; otherwise the individual
  # HTM_DB* variables are used when HTM_DBNAME is set; otherwise nil.
  return parse_connection_url(ENV['HTM_DBURL']) if ENV['HTM_DBURL']
  return parse_connection_params if ENV['HTM_DBNAME']

  nil
end
458
+
459
+ private
460
+
461
def verify_extensions(conn)
  # Reports, for each extension below, either its installed version or a
  # warning that it is missing. Pairs are [name in pg_extension, label shown].
  [%w[vector pgvector], %w[pg_trgm pg_trgm]].each do |extname, label|
    row = conn.exec("SELECT extversion FROM pg_extension WHERE extname='#{extname}'").first

    if row
      puts "✓ #{label} version: #{row['extversion']}"
    else
      puts "⚠ Warning: #{label} extension not found"
    end
  end

  nil
end
478
+
479
+ # Run ActiveRecord migrations from db/migrate/
480
+ #
481
+ # @return [void]
482
+ #
483
def run_activerecord_migrations
  # Applies, in filename order, every migration in db/migrate/ whose version
  # is not yet recorded in schema_migrations, recording each one after it
  # runs. Returns early (with a warning) when db/migrate/ does not exist.
  #
  # The version is derived from the file name, so it is passed through the
  # connection's quoting before being placed in SQL instead of being
  # interpolated raw.
  migrations_path = File.expand_path('../../db/migrate', __dir__)

  unless Dir.exist?(migrations_path)
    puts "⚠ No migrations directory found at #{migrations_path}"
    return
  end

  conn = ActiveRecord::Base.connection

  # Create schema_migrations table if it doesn't exist
  unless conn.table_exists?('schema_migrations')
    conn.create_table(:schema_migrations, id: false) do |t|
      t.string :version, null: false, primary_key: true
    end
  end

  migration_files = Dir.glob("#{migrations_path}/*.rb").sort
  puts "Found #{migration_files.length} migration files"

  migration_files.each do |file|
    version = File.basename(file).split('_').first
    name = File.basename(file, '.rb')
    quoted_version = conn.quote(version)

    already_run = conn.select_value(
      "SELECT COUNT(*) FROM schema_migrations WHERE version = #{quoted_version}"
    ).to_i > 0

    if already_run
      puts " ✓ #{name} (already migrated)"
    else
      puts " → Running #{name}..."
      require file

      # The class name is the file name without its version prefix, with each
      # underscore-separated word capitalized
      # (e.g. 20250101000002_create_robots -> CreateRobots).
      class_name = name.split('_')[1..].map(&:capitalize).join
      migration_class = Object.const_get(class_name)

      migration_class.new.migrate(:up)

      # Record the version only after the migration ran without raising.
      conn.execute(
        "INSERT INTO schema_migrations (version) VALUES (#{quoted_version})"
      )

      puts " ✓ Completed"
    end
  end

  puts "✓ All migrations completed"
end
539
+
540
+ # Clean up pg_dump output to make it more readable
541
+ #
542
+ # @param schema_dump [String] Raw pg_dump output
543
+ # @return [String] Cleaned schema
544
+ #
545
def clean_schema_dump(schema_dump)
  # Turns raw pg_dump output into the text stored in db/schema.sql:
  # a fixed banner, then the dump minus its leading preamble, minus
  # session-specific lines, with runs of blank lines collapsed.
  banner = [
    "-- HTM Database Schema",
    "-- Auto-generated from database using pg_dump",
    "-- DO NOT EDIT THIS FILE MANUALLY",
    "-- Run 'rake htm:db:schema:dump' to regenerate",
    ""
  ]

  # The preamble ends at the first line that looks like real content.
  first_content = /^(SET|CREATE|ALTER|--\s*Name:|COMMENT)/

  # Lines dropped everywhere: SET commands, set_config calls and extension
  # comments (the CREATE EXTENSION statements themselves are kept).
  discarded = [/^SET /, /^SELECT pg_catalog\.set_config/, /^COMMENT ON EXTENSION/]

  body = schema_dump
         .split("\n")
         .drop_while { |line| !line.match?(first_content) }
         .reject { |line| discarded.any? { |pattern| line.match?(pattern) } }

  # Collapse three or more consecutive newlines into a single blank line.
  (banner + body).join("\n").gsub(/\n{3,}/, "\n\n")
end
588
+
589
+ # Old methods removed - now using ActiveRecord migrations
590
+ # def run_schema(conn) - REMOVED
591
+ # def run_migrations_if_needed(conn) - REMOVED (see run_activerecord_migrations above)
592
+ end
593
+ end
594
+ end
@@ -0,0 +1,115 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'errors'
4
+
5
class HTM
  # Embedding Service - validates and post-processes vector embeddings.
  #
  # The embedding itself is produced by the callable configured as
  # HTM.configuration.embedding_generator; this class wraps that call with:
  # - validation of the returned value
  # - truncation / zero-padding to a fixed storage dimension
  # - logging and error wrapping
  # - formatting for database storage
  #
  class EmbeddingService
    MAX_DIMENSION = 2000 # Maximum dimension for pgvector HNSW index

    class << self
      # Generate an embedding for +text+ and prepare it for storage.
      #
      # @param text [String] Text to embed
      # @return [Hash] Processed embedding with metadata
      #   {
      #     embedding: Array<Float>,    # Embedding (truncated to MAX_DIMENSION if longer)
      #     dimension: Integer,         # Length of :embedding
      #     storage_embedding: String,  # Zero-padded vector formatted for storage
      #     storage_dimension: Integer  # Always MAX_DIMENSION
      #   }
      # @raise [HTM::EmbeddingError] if the generator fails or returns an
      #   invalid value
      #
      def generate(text)
        HTM.logger.debug "EmbeddingService: Generating embedding for #{text.length} chars"

        vector = HTM.configuration.embedding_generator.call(text)
        validate_embedding!(vector)

        dimension = vector.length
        if dimension > MAX_DIMENSION
          HTM.logger.warn "EmbeddingService: Embedding dimension #{dimension} exceeds max #{MAX_DIMENSION}, truncating"
          vector = vector[0...MAX_DIMENSION]
          dimension = MAX_DIMENSION
        end

        # Every stored vector has exactly MAX_DIMENSION entries.
        stored = format_for_storage(pad_embedding(vector))

        HTM.logger.debug "EmbeddingService: Generated #{dimension}D embedding (padded to #{MAX_DIMENSION})"

        {
          embedding: vector,
          dimension: dimension,
          storage_embedding: stored,
          storage_dimension: MAX_DIMENSION
        }
      rescue HTM::EmbeddingError
        # Validation failures are already the right type; pass them through.
        raise
      rescue StandardError => e
        HTM.logger.error "EmbeddingService: Failed to generate embedding: #{e.message}"
        raise HTM::EmbeddingError, "Embedding generation failed: #{e.message}"
      end

      # Validate the value returned by the embedding generator.
      #
      # @param embedding [Object] Raw embedding from generator
      # @raise [HTM::EmbeddingError] if it is not a non-empty Array of finite
      #   numeric values
      #
      def validate_embedding!(embedding)
        raise HTM::EmbeddingError, "Embedding must be an Array, got #{embedding.class}" unless embedding.is_a?(Array)
        raise HTM::EmbeddingError, "Embedding array is empty" if embedding.empty?

        all_numeric = embedding.all? { |value| value.is_a?(Numeric) }
        raise HTM::EmbeddingError, "Embedding must contain only numeric values" unless all_numeric

        # Not every Numeric responds to nan?/infinite?, hence the respond_to? checks.
        non_finite = embedding.any? do |value|
          (value.respond_to?(:nan?) && value.nan?) ||
            (value.respond_to?(:infinite?) && value.infinite?)
        end
        raise HTM::EmbeddingError, "Embedding contains NaN or Infinity values" if non_finite
      end

      # Right-pad an embedding with 0.0 up to MAX_DIMENSION entries.
      #
      # @param embedding [Array<Float>] Original embedding
      # @return [Array<Float>] Padded embedding (the same object when it is
      #   already MAX_DIMENSION or longer)
      #
      def pad_embedding(embedding)
        missing = MAX_DIMENSION - embedding.length
        return embedding unless missing.positive?

        embedding + Array.new(missing, 0.0)
      end

      # Format an embedding as a bracketed, comma-separated string.
      #
      # @param embedding [Array<Float>] Padded embedding
      # @return [String] e.g. "[0.1,0.2,0.0]"
      #
      def format_for_storage(embedding)
        "[#{embedding.join(',')}]"
      end
    end
  end
end