htm 0.0.1 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184) hide show
  1. checksums.yaml +4 -4
  2. data/.aigcm_msg +1 -0
  3. data/.architecture/reviews/comprehensive-codebase-review.md +577 -0
  4. data/.claude/settings.local.json +92 -0
  5. data/.envrc +1 -0
  6. data/.irbrc +283 -80
  7. data/.tbls.yml +31 -0
  8. data/CHANGELOG.md +314 -16
  9. data/CLAUDE.md +603 -0
  10. data/README.md +76 -5
  11. data/Rakefile +5 -0
  12. data/SETUP.md +132 -101
  13. data/db/migrate/{20250101000001_enable_extensions.rb → 00001_enable_extensions.rb} +0 -1
  14. data/db/migrate/00002_create_robots.rb +11 -0
  15. data/db/migrate/00003_create_file_sources.rb +20 -0
  16. data/db/migrate/00004_create_nodes.rb +65 -0
  17. data/db/migrate/00005_create_tags.rb +13 -0
  18. data/db/migrate/00006_create_node_tags.rb +18 -0
  19. data/db/migrate/00007_create_robot_nodes.rb +26 -0
  20. data/db/migrate/00009_add_working_memory_to_robot_nodes.rb +12 -0
  21. data/db/schema.sql +390 -36
  22. data/docs/api/database.md +19 -232
  23. data/docs/api/embedding-service.md +1 -7
  24. data/docs/api/htm.md +305 -364
  25. data/docs/api/index.md +1 -7
  26. data/docs/api/long-term-memory.md +342 -590
  27. data/docs/api/yard/HTM/ActiveRecordConfig.md +23 -0
  28. data/docs/api/yard/HTM/AuthorizationError.md +11 -0
  29. data/docs/api/yard/HTM/CircuitBreaker.md +92 -0
  30. data/docs/api/yard/HTM/CircuitBreakerOpenError.md +34 -0
  31. data/docs/api/yard/HTM/Configuration.md +175 -0
  32. data/docs/api/yard/HTM/Database.md +99 -0
  33. data/docs/api/yard/HTM/DatabaseError.md +14 -0
  34. data/docs/api/yard/HTM/EmbeddingError.md +18 -0
  35. data/docs/api/yard/HTM/EmbeddingService.md +58 -0
  36. data/docs/api/yard/HTM/Error.md +11 -0
  37. data/docs/api/yard/HTM/JobAdapter.md +39 -0
  38. data/docs/api/yard/HTM/LongTermMemory.md +342 -0
  39. data/docs/api/yard/HTM/NotFoundError.md +17 -0
  40. data/docs/api/yard/HTM/Observability.md +107 -0
  41. data/docs/api/yard/HTM/QueryTimeoutError.md +19 -0
  42. data/docs/api/yard/HTM/Railtie.md +27 -0
  43. data/docs/api/yard/HTM/ResourceExhaustedError.md +13 -0
  44. data/docs/api/yard/HTM/TagError.md +18 -0
  45. data/docs/api/yard/HTM/TagService.md +67 -0
  46. data/docs/api/yard/HTM/Timeframe/Result.md +24 -0
  47. data/docs/api/yard/HTM/Timeframe.md +40 -0
  48. data/docs/api/yard/HTM/TimeframeExtractor/Result.md +24 -0
  49. data/docs/api/yard/HTM/TimeframeExtractor.md +45 -0
  50. data/docs/api/yard/HTM/ValidationError.md +20 -0
  51. data/docs/api/yard/HTM/WorkingMemory.md +131 -0
  52. data/docs/api/yard/HTM.md +80 -0
  53. data/docs/api/yard/index.csv +179 -0
  54. data/docs/api/yard-reference.md +51 -0
  55. data/docs/architecture/adrs/001-postgresql-timescaledb.md +1 -1
  56. data/docs/architecture/adrs/003-ollama-embeddings.md +1 -1
  57. data/docs/architecture/adrs/010-redis-working-memory-rejected.md +2 -27
  58. data/docs/architecture/adrs/index.md +2 -13
  59. data/docs/architecture/hive-mind.md +165 -166
  60. data/docs/architecture/index.md +2 -2
  61. data/docs/architecture/overview.md +5 -171
  62. data/docs/architecture/two-tier-memory.md +1 -35
  63. data/docs/assets/images/adr-010-current-architecture.svg +37 -0
  64. data/docs/assets/images/adr-010-proposed-architecture.svg +48 -0
  65. data/docs/assets/images/adr-dependency-tree.svg +93 -0
  66. data/docs/assets/images/class-hierarchy.svg +55 -0
  67. data/docs/assets/images/exception-hierarchy.svg +45 -0
  68. data/docs/assets/images/htm-architecture-overview.svg +83 -0
  69. data/docs/assets/images/htm-complete-memory-flow.svg +160 -0
  70. data/docs/assets/images/htm-context-assembly-flow.svg +148 -0
  71. data/docs/assets/images/htm-eviction-process.svg +141 -0
  72. data/docs/assets/images/htm-memory-addition-flow.svg +138 -0
  73. data/docs/assets/images/htm-memory-recall-flow.svg +152 -0
  74. data/docs/assets/images/htm-node-states.svg +123 -0
  75. data/docs/assets/images/project-structure.svg +78 -0
  76. data/docs/assets/images/test-directory-structure.svg +38 -0
  77. data/{dbdoc → docs/database}/README.md +127 -125
  78. data/docs/database/public.file_sources.md +42 -0
  79. data/docs/database/public.file_sources.svg +211 -0
  80. data/{dbdoc → docs/database}/public.node_tags.md +7 -8
  81. data/docs/database/public.node_tags.svg +239 -0
  82. data/{dbdoc → docs/database}/public.nodes.md +22 -17
  83. data/docs/database/public.nodes.svg +271 -0
  84. data/docs/database/public.robot_nodes.md +46 -0
  85. data/docs/database/public.robot_nodes.svg +243 -0
  86. data/{dbdoc → docs/database}/public.robots.md +2 -3
  87. data/docs/database/public.robots.svg +161 -0
  88. data/docs/database/public.tags.svg +139 -0
  89. data/{dbdoc → docs/database}/schema.json +941 -630
  90. data/docs/database/schema.svg +282 -0
  91. data/docs/development/index.md +1 -29
  92. data/docs/development/schema.md +134 -309
  93. data/docs/development/testing.md +1 -9
  94. data/docs/getting-started/index.md +47 -0
  95. data/docs/{installation.md → getting-started/installation.md} +2 -2
  96. data/docs/{quick-start.md → getting-started/quick-start.md} +5 -5
  97. data/docs/guides/adding-memories.md +295 -643
  98. data/docs/guides/recalling-memories.md +36 -1
  99. data/docs/guides/search-strategies.md +85 -51
  100. data/docs/images/htm-er-diagram.svg +156 -0
  101. data/docs/index.md +16 -31
  102. data/docs/multi_framework_support.md +4 -4
  103. data/examples/README.md +280 -0
  104. data/examples/basic_usage.rb +18 -16
  105. data/examples/cli_app/htm_cli.rb +146 -8
  106. data/examples/cli_app/temp.log +93 -0
  107. data/examples/custom_llm_configuration.rb +1 -2
  108. data/examples/example_app/app.rb +11 -14
  109. data/examples/file_loader_usage.rb +177 -0
  110. data/examples/robot_groups/lib/robot_group.rb +419 -0
  111. data/examples/robot_groups/lib/working_memory_channel.rb +140 -0
  112. data/examples/robot_groups/multi_process.rb +286 -0
  113. data/examples/robot_groups/robot_worker.rb +136 -0
  114. data/examples/robot_groups/same_process.rb +229 -0
  115. data/examples/sinatra_app/Gemfile +1 -0
  116. data/examples/sinatra_app/Gemfile.lock +166 -0
  117. data/examples/sinatra_app/app.rb +219 -24
  118. data/examples/timeframe_demo.rb +276 -0
  119. data/lib/htm/active_record_config.rb +10 -3
  120. data/lib/htm/circuit_breaker.rb +202 -0
  121. data/lib/htm/configuration.rb +313 -80
  122. data/lib/htm/database.rb +67 -36
  123. data/lib/htm/embedding_service.rb +39 -2
  124. data/lib/htm/errors.rb +131 -11
  125. data/lib/htm/{sinatra.rb → integrations/sinatra.rb} +87 -12
  126. data/lib/htm/job_adapter.rb +10 -3
  127. data/lib/htm/jobs/generate_embedding_job.rb +5 -4
  128. data/lib/htm/jobs/generate_tags_job.rb +4 -0
  129. data/lib/htm/loaders/markdown_loader.rb +263 -0
  130. data/lib/htm/loaders/paragraph_chunker.rb +112 -0
  131. data/lib/htm/long_term_memory.rb +601 -321
  132. data/lib/htm/models/file_source.rb +99 -0
  133. data/lib/htm/models/node.rb +116 -12
  134. data/lib/htm/models/robot.rb +53 -4
  135. data/lib/htm/models/robot_node.rb +51 -0
  136. data/lib/htm/models/tag.rb +302 -0
  137. data/lib/htm/observability.rb +395 -0
  138. data/lib/htm/tag_service.rb +60 -3
  139. data/lib/htm/tasks.rb +29 -0
  140. data/lib/htm/timeframe.rb +194 -0
  141. data/lib/htm/timeframe_extractor.rb +307 -0
  142. data/lib/htm/version.rb +1 -1
  143. data/lib/htm/working_memory.rb +165 -70
  144. data/lib/htm.rb +352 -133
  145. data/lib/tasks/doc.rake +300 -0
  146. data/lib/tasks/files.rake +299 -0
  147. data/lib/tasks/htm.rake +188 -2
  148. data/lib/tasks/jobs.rake +10 -12
  149. data/lib/tasks/tags.rake +194 -0
  150. data/mkdocs.yml +91 -9
  151. data/notes/ARCHITECTURE_REVIEW.md +1167 -0
  152. data/notes/IMPLEMENTATION_SUMMARY.md +606 -0
  153. data/notes/MULTI_FRAMEWORK_IMPLEMENTATION.md +451 -0
  154. data/notes/next_steps.md +100 -0
  155. data/notes/plan.md +627 -0
  156. data/notes/tag_ontology_enhancement_ideas.md +222 -0
  157. data/notes/timescaledb_removal_summary.md +200 -0
  158. metadata +177 -37
  159. data/db/migrate/20250101000002_create_robots.rb +0 -14
  160. data/db/migrate/20250101000003_create_nodes.rb +0 -42
  161. data/db/migrate/20250101000005_create_tags.rb +0 -38
  162. data/db/migrate/20250101000007_add_node_vector_indexes.rb +0 -30
  163. data/dbdoc/public.node_tags.svg +0 -112
  164. data/dbdoc/public.nodes.svg +0 -118
  165. data/dbdoc/public.robots.svg +0 -90
  166. data/dbdoc/public.tags.svg +0 -60
  167. data/dbdoc/schema.svg +0 -154
  168. data/{dbdoc → docs/database}/public.node_stats.md +0 -0
  169. data/{dbdoc → docs/database}/public.node_stats.svg +0 -0
  170. data/{dbdoc → docs/database}/public.nodes_tags.md +0 -0
  171. data/{dbdoc → docs/database}/public.nodes_tags.svg +0 -0
  172. data/{dbdoc → docs/database}/public.ontology_structure.md +0 -0
  173. data/{dbdoc → docs/database}/public.ontology_structure.svg +0 -0
  174. data/{dbdoc → docs/database}/public.operations_log.md +0 -0
  175. data/{dbdoc → docs/database}/public.operations_log.svg +0 -0
  176. data/{dbdoc → docs/database}/public.relationships.md +0 -0
  177. data/{dbdoc → docs/database}/public.relationships.svg +0 -0
  178. data/{dbdoc → docs/database}/public.robot_activity.md +0 -0
  179. data/{dbdoc → docs/database}/public.robot_activity.svg +0 -0
  180. data/{dbdoc → docs/database}/public.schema_migrations.md +0 -0
  181. data/{dbdoc → docs/database}/public.schema_migrations.svg +0 -0
  182. data/{dbdoc → docs/database}/public.tags.md +3 -3
  183. data/{dbdoc → docs/database}/public.topic_relationships.md +0 -0
  184. data/{dbdoc → docs/database}/public.topic_relationships.svg +0 -0
@@ -0,0 +1,395 @@
1
+ # frozen_string_literal: true
2
+
3
+ class HTM
4
+ # Observability module for monitoring and metrics collection
5
+ #
6
+ # Provides comprehensive monitoring of HTM components including:
7
+ # - Connection pool health monitoring with alerts
8
+ # - Query timing and performance metrics
9
+ # - Cache efficiency tracking
10
+ # - Service health checks
11
+ # - Memory usage statistics
12
+ #
13
+ # @example Basic usage
14
+ # stats = HTM::Observability.collect_all
15
+ # puts stats[:connection_pool][:status] # => :healthy
16
+ #
17
+ # @example Connection pool monitoring
18
+ # pool_stats = HTM::Observability.connection_pool_stats
19
+ # if pool_stats[:status] == :exhausted
20
+ # logger.error "Connection pool exhausted!"
21
+ # end
22
+ #
23
+ # @example Health check
24
+ # if HTM::Observability.healthy?
25
+ # puts "All systems operational"
26
+ # else
27
+ # puts "Health check failed: #{HTM::Observability.health_check[:issues]}"
28
+ # end
29
+ #
30
+ module Observability
31
+ # Connection pool utilization thresholds
32
+ POOL_WARNING_THRESHOLD = 0.75 # 75% utilization triggers warning
33
+ POOL_CRITICAL_THRESHOLD = 0.90 # 90% utilization triggers critical
34
+
35
+ # Timing metrics storage (thread-safe)
36
+ @metrics_mutex = Mutex.new
37
+ @query_timings = []
38
+ @embedding_timings = []
39
+ @tag_extraction_timings = []
40
+ @max_timing_samples = 1000
41
+
42
+ class << self
43
+ # Collect all observability metrics
44
+ #
45
+ # @return [Hash] Comprehensive metrics including:
46
+ # - :connection_pool - Pool stats with health status
47
+ # - :cache - Query cache hit rates and size
48
+ # - :circuit_breakers - Service circuit breaker states
49
+ # - :query_timings - Recent query performance
50
+ # - :service_timings - Embedding/tag generation times
51
+ # - :memory_usage - System memory stats
52
+ #
53
+ def collect_all
54
+ {
55
+ connection_pool: connection_pool_stats,
56
+ cache: cache_stats,
57
+ circuit_breakers: circuit_breaker_stats,
58
+ query_timings: query_timing_stats,
59
+ service_timings: service_timing_stats,
60
+ memory_usage: memory_stats,
61
+ collected_at: Time.current
62
+ }
63
+ end
64
+
65
+ # Get connection pool statistics with health status
66
+ #
67
+ # @return [Hash] Pool statistics including:
68
+ # - :size - Maximum pool size
69
+ # - :connections - Current total connections
70
+ # - :in_use - Connections currently checked out
71
+ # - :available - Connections available for checkout
72
+ # - :utilization - Usage percentage (0.0-100.0, rounded to 2 decimals)
73
+ # - :status - Health status (:healthy, :warning, :critical, :exhausted; :unavailable or :error when the pool cannot be inspected)
74
+ # - :wait_timeout - Connection wait timeout (ms)
75
+ #
76
+ def connection_pool_stats
77
+ return { status: :unavailable, message: "ActiveRecord not connected" } unless connected?
78
+
79
+ pool = ActiveRecord::Base.connection_pool
80
+
81
+ size = pool.size
82
+ connections = pool.connections.size
83
+ in_use = pool.connections.count(&:in_use?)
84
+ available = connections - in_use
85
+
86
+ # Calculate utilization based on connections in use vs pool size
87
+ utilization = size > 0 ? in_use.to_f / size : 0.0
88
+
89
+ # Determine health status
90
+ status = case
91
+ when available == 0 && in_use >= size
92
+ :exhausted
93
+ when utilization >= POOL_CRITICAL_THRESHOLD
94
+ :critical
95
+ when utilization >= POOL_WARNING_THRESHOLD
96
+ :warning
97
+ else
98
+ :healthy
99
+ end
100
+
101
+ stats = {
102
+ size: size,
103
+ connections: connections,
104
+ in_use: in_use,
105
+ available: available,
106
+ utilization: (utilization * 100).round(2),
107
+ status: status,
108
+ wait_timeout: pool.checkout_timeout * 1000 # Convert to ms
109
+ }
110
+
111
+ # Log warnings if pool is stressed
112
+ log_pool_status(stats)
113
+
114
+ stats
115
+ rescue StandardError => e
116
+ { status: :error, message: e.message }
117
+ end
118
+
119
+ # Get query cache statistics
120
+ #
121
+ # @return [Hash] Placeholder hash; its :info entry points callers to LongTermMemory#stats[:cache]
122
+ #
123
+ def cache_stats
124
+ # Try to access LongTermMemory cache stats
125
+ # Note: This requires access to an LTM instance
126
+ {
127
+ info: "Cache stats available via LongTermMemory#stats[:cache]"
128
+ }
129
+ end
130
+
131
+ # Get circuit breaker states for all services
132
+ #
133
+ # @return [Hash] Circuit breaker states:
134
+ # - :embedding_service - State, failure count, and last failure time
135
+ # - :tag_service - State, failure count, and last failure time
136
+ #
137
+ def circuit_breaker_stats
138
+ stats = {}
139
+
140
+ if defined?(HTM::EmbeddingService)
141
+ cb = HTM::EmbeddingService.circuit_breaker
142
+ stats[:embedding_service] = {
143
+ state: cb.state,
144
+ failure_count: cb.failure_count,
145
+ last_failure_time: cb.last_failure_time
146
+ }
147
+ end
148
+
149
+ if defined?(HTM::TagService)
150
+ cb = HTM::TagService.circuit_breaker
151
+ stats[:tag_service] = {
152
+ state: cb.state,
153
+ failure_count: cb.failure_count,
154
+ last_failure_time: cb.last_failure_time
155
+ }
156
+ end
157
+
158
+ stats
159
+ rescue StandardError => e
160
+ { error: e.message }
161
+ end
162
+
163
+ # Record query timing for metrics
164
+ #
165
+ # @param duration_ms [Float] Query duration in milliseconds
166
+ # @param query_type [Symbol] Type of query (:vector, :fulltext, :hybrid); defaults to :unknown
167
+ #
168
+ def record_query_timing(duration_ms, query_type: :unknown)
169
+ @metrics_mutex.synchronize do
170
+ @query_timings << {
171
+ duration_ms: duration_ms,
172
+ query_type: query_type,
173
+ recorded_at: Time.current
174
+ }
175
+
176
+ # Keep only recent samples
177
+ @query_timings.shift if @query_timings.size > @max_timing_samples
178
+ end
179
+ end
180
+
181
+ # Record embedding generation timing
182
+ #
183
+ # @param duration_ms [Float] Generation duration in milliseconds
184
+ #
185
+ def record_embedding_timing(duration_ms)
186
+ @metrics_mutex.synchronize do
187
+ @embedding_timings << {
188
+ duration_ms: duration_ms,
189
+ recorded_at: Time.current
190
+ }
191
+ @embedding_timings.shift if @embedding_timings.size > @max_timing_samples
192
+ end
193
+ end
194
+
195
+ # Record tag extraction timing
196
+ #
197
+ # @param duration_ms [Float] Extraction duration in milliseconds
198
+ #
199
+ def record_tag_timing(duration_ms)
200
+ @metrics_mutex.synchronize do
201
+ @tag_extraction_timings << {
202
+ duration_ms: duration_ms,
203
+ recorded_at: Time.current
204
+ }
205
+ @tag_extraction_timings.shift if @tag_extraction_timings.size > @max_timing_samples
206
+ end
207
+ end
208
+
209
+ # Get query timing statistics
210
+ #
211
+ # @return [Hash] Timing statistics: sample_count, avg, min, max, p50, p95, p99 (only sample_count when no samples exist)
212
+ #
213
+ def query_timing_stats
214
+ calculate_timing_stats(@query_timings, :query)
215
+ end
216
+
217
+ # Get service timing statistics (embedding and tag extraction)
218
+ #
219
+ # @return [Hash] Timing stats for embedding and tag services
220
+ #
221
+ def service_timing_stats
222
+ {
223
+ embedding: calculate_timing_stats(@embedding_timings, :embedding),
224
+ tag_extraction: calculate_timing_stats(@tag_extraction_timings, :tag)
225
+ }
226
+ end
227
+
228
+ # Get memory usage statistics
229
+ #
230
+ # @return [Hash] Process RSS (MB) and selected GC stats, or { available: false } on error
231
+ #
232
+ def memory_stats
233
+ {
234
+ process_rss_mb: process_memory_mb,
235
+ gc_stats: GC.stat.slice(:count, :heap_allocated_pages, :heap_live_slots)
236
+ }
237
+ rescue StandardError
238
+ { available: false }
239
+ end
240
+
241
+ # Perform comprehensive health check
242
+ #
243
+ # @return [Hash] Health check results:
244
+ # - :healthy - Boolean overall health status
245
+ # - :checks - Individual check results
246
+ # - :issues - Array of identified issues
247
+ #
248
+ def health_check
249
+ checks = {}
250
+ issues = []
251
+
252
+ # Check database connection
253
+ checks[:database] = connected?
254
+ issues << "Database not connected" unless checks[:database]
255
+
256
+ # Check connection pool
257
+ pool_stats = connection_pool_stats
258
+ checks[:connection_pool] = pool_stats[:status] == :healthy || pool_stats[:status] == :warning
259
+ issues << "Connection pool #{pool_stats[:status]}" if [:critical, :exhausted].include?(pool_stats[:status])
260
+
261
+ # Check circuit breakers
262
+ cb_stats = circuit_breaker_stats
263
+ if cb_stats[:embedding_service]
264
+ checks[:embedding_circuit] = cb_stats[:embedding_service][:state] != :open
265
+ issues << "Embedding service circuit breaker open" unless checks[:embedding_circuit]
266
+ end
267
+ if cb_stats[:tag_service]
268
+ checks[:tag_circuit] = cb_stats[:tag_service][:state] != :open
269
+ issues << "Tag service circuit breaker open" unless checks[:tag_circuit]
270
+ end
271
+
272
+ # Check required extensions
273
+ if connected?
274
+ begin
275
+ checks[:pgvector] = extension_installed?('vector')
276
+ issues << "pgvector extension not installed" unless checks[:pgvector]
277
+
278
+ checks[:pg_trgm] = extension_installed?('pg_trgm')
279
+ issues << "pg_trgm extension not installed" unless checks[:pg_trgm]
280
+ rescue StandardError => e
281
+ checks[:extensions] = false
282
+ issues << "Failed to check extensions: #{e.message}"
283
+ end
284
+ end
285
+
286
+ {
287
+ healthy: issues.empty?,
288
+ checks: checks,
289
+ issues: issues,
290
+ checked_at: Time.current
291
+ }
292
+ end
293
+
294
+ # Quick health check - returns boolean
295
+ #
296
+ # @return [Boolean] true if system is healthy
297
+ #
298
+ def healthy?
299
+ health_check[:healthy]
300
+ end
301
+
302
+ # Clear all collected timing metrics
303
+ #
304
+ # @return [void]
305
+ #
306
+ def reset_metrics!
307
+ @metrics_mutex.synchronize do
308
+ @query_timings.clear
309
+ @embedding_timings.clear
310
+ @tag_extraction_timings.clear
311
+ end
312
+ end
313
+
314
+ private
315
+
316
+ # Check if ActiveRecord is connected
317
+ def connected?
318
+ return false unless defined?(ActiveRecord::Base)
319
+ ActiveRecord::Base.connected? && ActiveRecord::Base.connection.active?
320
+ rescue StandardError
321
+ false
322
+ end
323
+
324
+ # Check if a PostgreSQL extension is installed
325
+ def extension_installed?(name)
326
+ result = ActiveRecord::Base.connection.select_value(
327
+ ActiveRecord::Base.sanitize_sql_array(
328
+ ["SELECT COUNT(*) FROM pg_extension WHERE extname = ?", name]
329
+ )
330
+ )
331
+ result.to_i > 0
332
+ end
333
+
334
+ # Calculate timing statistics from samples
335
+ def calculate_timing_stats(timings, type)
336
+ @metrics_mutex.synchronize do
337
+ return { sample_count: 0 } if timings.empty?
338
+
339
+ durations = timings.map { |t| t[:duration_ms] }.sort
340
+ count = durations.size
341
+
342
+ {
343
+ sample_count: count,
344
+ avg_ms: (durations.sum / count).round(2),
345
+ min_ms: durations.first.round(2),
346
+ max_ms: durations.last.round(2),
347
+ p50_ms: percentile(durations, 50).round(2),
348
+ p95_ms: percentile(durations, 95).round(2),
349
+ p99_ms: percentile(durations, 99).round(2)
350
+ }
351
+ end
352
+ end
353
+
354
+ # Calculate percentile from sorted array
355
+ def percentile(sorted_array, percentile)
356
+ return 0 if sorted_array.empty?
357
+
358
+ k = (percentile / 100.0 * (sorted_array.size - 1))
359
+ f = k.floor
360
+ c = k.ceil
361
+
362
+ return sorted_array[f] if f == c
363
+
364
+ sorted_array[f] * (c - k) + sorted_array[c] * (k - f)
365
+ end
366
+
367
+ # Get process memory in MB
368
+ def process_memory_mb
369
+ if RUBY_PLATFORM.include?('darwin')
370
+ # macOS: Use ps command
371
+ `ps -o rss= -p #{Process.pid}`.strip.to_i / 1024.0
372
+ elsif File.exist?('/proc/self/status')
373
+ # Linux: Read from proc
374
+ File.read('/proc/self/status').match(/VmRSS:\s+(\d+)/)[1].to_i / 1024.0
375
+ else
376
+ nil
377
+ end
378
+ rescue StandardError
379
+ nil
380
+ end
381
+
382
+ # Log pool status based on health
383
+ def log_pool_status(stats)
384
+ case stats[:status]
385
+ when :exhausted
386
+ HTM.logger.error "Connection pool EXHAUSTED: #{stats[:in_use]}/#{stats[:size]} connections in use (#{stats[:utilization]}%)"
387
+ when :critical
388
+ HTM.logger.warn "Connection pool CRITICAL: #{stats[:in_use]}/#{stats[:size]} connections in use (#{stats[:utilization]}%)"
389
+ when :warning
390
+ HTM.logger.warn "Connection pool WARNING: #{stats[:in_use]}/#{stats[:size]} connections in use (#{stats[:utilization]}%)"
391
+ end
392
+ end
393
+ end
394
+ end
395
+ end
@@ -10,25 +10,59 @@ class HTM
10
10
  # - Format validation (lowercase, alphanumeric, hyphens, colons)
11
11
  # - Depth validation (max 4 levels)
12
12
  # - Ontology consistency
13
+ # - Circuit breaker protection for external LLM failures
13
14
  #
14
15
  # The actual LLM call is delegated to HTM.configuration.tag_extractor
15
16
  #
16
17
  class TagService
17
- MAX_DEPTH = 5 # Maximum hierarchy depth (4 colons)
18
+ MAX_DEPTH = 4 # Maximum hierarchy depth (3 colons)
18
19
  TAG_FORMAT = /^[a-z0-9\-]+(:[a-z0-9\-]+)*$/ # Validation regex
19
20
 
21
+ # Circuit breaker for tag extraction API calls
22
+ @circuit_breaker = nil
23
+ @circuit_breaker_mutex = Mutex.new
24
+
25
+ class << self
26
+ # Get or create the circuit breaker for tag service
27
+ #
28
+ # @return [HTM::CircuitBreaker] The circuit breaker instance
29
+ #
30
+ def circuit_breaker
31
+ @circuit_breaker_mutex.synchronize do
32
+ @circuit_breaker ||= HTM::CircuitBreaker.new(
33
+ name: 'tag_service',
34
+ failure_threshold: 5,
35
+ reset_timeout: 60
36
+ )
37
+ end
38
+ end
39
+
40
+ # Reset the circuit breaker (useful for testing)
41
+ #
42
+ # @return [void]
43
+ #
44
+ def reset_circuit_breaker!
45
+ @circuit_breaker_mutex.synchronize do
46
+ @circuit_breaker&.reset!
47
+ end
48
+ end
49
+ end
50
+
20
51
  # Extract tags with validation and processing
21
52
  #
22
53
  # @param content [String] Text to analyze
23
54
  # @param existing_ontology [Array<String>] Sample of existing tags for context
24
55
  # @return [Array<String>] Validated tag names
56
+ # @raise [CircuitBreakerOpenError] If circuit breaker is open
25
57
  #
26
58
  def self.extract(content, existing_ontology: [])
27
59
  HTM.logger.debug "TagService: Extracting tags from #{content.length} chars"
28
60
  HTM.logger.debug "TagService: Using ontology with #{existing_ontology.size} existing tags"
29
61
 
30
- # Call configured tag extractor
31
- raw_tags = HTM.configuration.tag_extractor.call(content, existing_ontology)
62
+ # Use circuit breaker to protect against cascading failures
63
+ raw_tags = circuit_breaker.call do
64
+ HTM.configuration.tag_extractor.call(content, existing_ontology)
65
+ end
32
66
 
33
67
  # Parse response (may be string or array)
34
68
  parsed_tags = parse_tags(raw_tags)
@@ -40,6 +74,9 @@ class HTM
40
74
 
41
75
  valid_tags
42
76
 
77
+ rescue HTM::CircuitBreakerOpenError
78
+ # Re-raise circuit breaker errors without wrapping
79
+ raise
43
80
  rescue HTM::TagError
44
81
  raise
45
82
  rescue StandardError => e
@@ -87,6 +124,21 @@ class HTM
87
124
  next
88
125
  end
89
126
 
127
+ # Parse hierarchy for ontological validation
128
+ levels = tag.split(':')
129
+
130
+ # Check for self-containment (root == leaf creates circular reference)
131
+ if levels.size > 1 && levels.first == levels.last
132
+ HTM.logger.warn "TagService: Self-containment detected (root == leaf), skipping: #{tag}"
133
+ next
134
+ end
135
+
136
+ # Check for duplicate segments in path (indicates circular/redundant hierarchy)
137
+ if levels.size != levels.uniq.size
138
+ HTM.logger.warn "TagService: Duplicate segment in hierarchy, skipping: #{tag}"
139
+ next
140
+ end
141
+
90
142
  # Tag is valid
91
143
  valid_tags << tag
92
144
  end
@@ -105,6 +157,11 @@ class HTM
105
157
  return false unless tag.match?(TAG_FORMAT)
106
158
  return false if tag.count(':') >= MAX_DEPTH
107
159
 
160
+ # Ontological validation
161
+ levels = tag.split(':')
162
+ return false if levels.size > 1 && levels.first == levels.last # Self-containment
163
+ return false if levels.size != levels.uniq.size # Duplicate segments
164
+
108
165
  true
109
166
  end
110
167
 
data/lib/htm/tasks.rb CHANGED
@@ -28,11 +28,40 @@
28
28
  # rake htm:jobs:failed # Show nodes with processing issues
29
29
  # rake htm:jobs:clear_all # Clear all embeddings and tags (testing)
30
30
  #
31
+ # Tag tasks:
32
+ # rake htm:tags:tree # Display tags as hierarchical tree
33
+ # rake htm:tags:tree[prefix] # Display tags with prefix filter
34
+ # rake htm:tags:mermaid # Export all tags to tags.md (Mermaid)
35
+ # rake htm:tags:mermaid[prefix] # Export filtered tags to tags.md
36
+ # rake htm:tags:svg # Export all tags to tags.svg
37
+ # rake htm:tags:svg[prefix] # Export filtered tags to tags.svg
38
+ # rake htm:tags:export # Export all tags to tags.txt, tags.md, tags.svg
39
+ # rake htm:tags:export[prefix] # Export filtered tags to all formats
40
+ #
41
+ # File loading tasks:
42
+ # rake htm:files:load[path] # Load a markdown file into memory
43
+ # rake htm:files:load_dir[path] # Load all markdown files from a directory
44
+ # rake htm:files:list # List all loaded file sources
45
+ # rake htm:files:info[path] # Show details for a loaded file
46
+ # rake htm:files:unload[path] # Unload a file from memory
47
+ # rake htm:files:sync # Sync all loaded files (reload changed files)
48
+ # rake htm:files:stats # Show file loading statistics
49
+ #
50
+ # Documentation tasks:
51
+ # rake htm:doc:yard # Build YARD API documentation website
52
+ # rake htm:doc:server # Start YARD documentation server (live reload)
53
+ # rake htm:doc:server[port] # Start server on custom port
54
+ # rake htm:doc:stats # Show documentation coverage statistics
55
+ # rake htm:doc:clean # Clean generated documentation
56
+ #
31
57
 
32
58
  if defined?(Rake)
33
59
  # Load the rake tasks
34
60
  load File.expand_path('../tasks/htm.rake', __dir__)
35
61
  load File.expand_path('../tasks/jobs.rake', __dir__)
62
+ load File.expand_path('../tasks/tags.rake', __dir__)
63
+ load File.expand_path('../tasks/files.rake', __dir__)
64
+ load File.expand_path('../tasks/doc.rake', __dir__)
36
65
  else
37
66
  warn "HTM tasks not loaded: Rake is not available"
38
67
  end