htm 0.0.17 → 0.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. checksums.yaml +4 -4
  2. data/.architecture/decisions/adrs/001-use-postgresql-timescaledb-storage.md +1 -1
  3. data/.architecture/decisions/adrs/011-database-side-embedding-generation-with-pgai.md +4 -4
  4. data/.architecture/decisions/adrs/012-llm-driven-ontology-topic-extraction.md +1 -1
  5. data/.envrc +12 -25
  6. data/.irbrc +7 -7
  7. data/.tbls.yml +2 -2
  8. data/CHANGELOG.md +71 -0
  9. data/README.md +1 -1
  10. data/Rakefile +8 -3
  11. data/SETUP.md +12 -12
  12. data/bin/htm_mcp +0 -4
  13. data/db/seed_data/README.md +2 -2
  14. data/db/seeds.rb +2 -2
  15. data/docs/api/database.md +37 -37
  16. data/docs/api/htm.md +1 -1
  17. data/docs/api/yard/HTM/ActiveRecordConfig.md +2 -2
  18. data/docs/api/yard/HTM/Configuration.md +26 -15
  19. data/docs/api/yard/HTM/Database.md +7 -8
  20. data/docs/api/yard/HTM/JobAdapter.md +1 -1
  21. data/docs/api/yard/HTM/Railtie.md +2 -2
  22. data/docs/architecture/adrs/001-postgresql-timescaledb.md +1 -1
  23. data/docs/architecture/adrs/011-pgai-integration.md +4 -4
  24. data/docs/database_rake_tasks.md +5 -5
  25. data/docs/development/rake-tasks.md +11 -11
  26. data/docs/development/setup.md +21 -21
  27. data/docs/development/testing.md +1 -1
  28. data/docs/getting-started/installation.md +20 -20
  29. data/docs/getting-started/quick-start.md +12 -12
  30. data/docs/guides/getting-started.md +2 -2
  31. data/docs/guides/long-term-memory.md +1 -1
  32. data/docs/guides/mcp-server.md +17 -17
  33. data/docs/guides/robot-groups.md +8 -8
  34. data/docs/index.md +4 -4
  35. data/docs/multi_framework_support.md +8 -8
  36. data/docs/setup_local_database.md +19 -19
  37. data/docs/using_rake_tasks_in_your_app.md +14 -14
  38. data/examples/README.md +50 -6
  39. data/examples/basic_usage.rb +31 -21
  40. data/examples/cli_app/README.md +8 -8
  41. data/examples/cli_app/htm_cli.rb +5 -5
  42. data/examples/config_file_example/README.md +256 -0
  43. data/examples/config_file_example/config/htm.local.yml +34 -0
  44. data/examples/config_file_example/custom_config.yml +22 -0
  45. data/examples/config_file_example/show_config.rb +125 -0
  46. data/examples/custom_llm_configuration.rb +7 -7
  47. data/examples/example_app/Rakefile +2 -2
  48. data/examples/example_app/app.rb +8 -8
  49. data/examples/file_loader_usage.rb +9 -9
  50. data/examples/mcp_client.rb +5 -5
  51. data/examples/rails_app/Gemfile.lock +48 -56
  52. data/examples/rails_app/README.md +1 -1
  53. data/examples/robot_groups/multi_process.rb +5 -5
  54. data/examples/robot_groups/robot_worker.rb +5 -5
  55. data/examples/robot_groups/same_process.rb +9 -9
  56. data/examples/sinatra_app/app.rb +1 -1
  57. data/examples/timeframe_demo.rb +1 -1
  58. data/lib/htm/active_record_config.rb +12 -25
  59. data/lib/htm/circuit_breaker.rb +0 -2
  60. data/lib/htm/config/defaults.yml +246 -0
  61. data/lib/htm/config.rb +888 -0
  62. data/lib/htm/database.rb +23 -27
  63. data/lib/htm/embedding_service.rb +0 -4
  64. data/lib/htm/integrations/sinatra.rb +3 -7
  65. data/lib/htm/job_adapter.rb +1 -15
  66. data/lib/htm/jobs/generate_embedding_job.rb +1 -7
  67. data/lib/htm/jobs/generate_propositions_job.rb +2 -12
  68. data/lib/htm/jobs/generate_tags_job.rb +1 -8
  69. data/lib/htm/loaders/defaults_loader.rb +143 -0
  70. data/lib/htm/loaders/xdg_config_loader.rb +116 -0
  71. data/lib/htm/mcp/cli.rb +200 -58
  72. data/lib/htm/mcp/server.rb +3 -3
  73. data/lib/htm/proposition_service.rb +2 -12
  74. data/lib/htm/railtie.rb +3 -4
  75. data/lib/htm/tag_service.rb +1 -8
  76. data/lib/htm/version.rb +1 -1
  77. data/lib/htm.rb +124 -5
  78. metadata +24 -4
  79. data/config/database.yml +0 -77
  80. data/lib/htm/configuration.rb +0 -799
data/lib/htm/config.rb ADDED
@@ -0,0 +1,888 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'anyway_config'
4
+ require 'logger'
5
+ require 'yaml'
6
+
7
+ class HTM
8
+ # ConfigSection provides method access to nested configuration hashes
9
+ #
10
+ # @example
11
+ # section = ConfigSection.new(host: 'localhost', port: 5432)
12
+ # section.host # => 'localhost'
13
+ # section.port # => 5432
14
+ #
15
class ConfigSection
  # Build a section from a hash. Keys are stored as symbols and nested
  # hashes are wrapped recursively in their own ConfigSection.
  #
  # @param hash [Hash, nil] initial values; nil is treated as empty
  def initialize(hash = {})
    @data = {}
    (hash || {}).each do |key, value|
      @data[key.to_sym] = value.is_a?(Hash) ? ConfigSection.new(value) : value
    end
  end

  # Dynamic accessors: `section.foo` reads :foo (nil when absent) and
  # `section.foo = value` stores the value as given (no wrapping).
  def method_missing(method, *args, &block)
    key = method.to_s
    if key.end_with?('=')
      @data[key.chomp('=').to_sym] = args.first
    else
      @data[method]
    end
  end

  # Reports true only for keys that are currently stored (and their setters).
  def respond_to_missing?(method, include_private = false)
    key = method.to_s.chomp('=').to_sym
    @data.key?(key) || super
  end

  # @return [Hash] symbol-keyed hash with nested sections converted back to hashes
  def to_h
    @data.transform_values do |v|
      v.is_a?(ConfigSection) ? v.to_h : v
    end
  end

  def [](key)
    @data[key.to_sym]
  end

  def []=(key, value)
    @data[key.to_sym] = value
  end

  # Deep-merge another section or hash on top of this one, returning a new
  # ConfigSection; the receiver is left untouched.
  #
  # Keys on both sides are normalized to symbols BEFORE merging. Without
  # that, a string-keyed overlay such as { 'database' => { 'host' => 'x' } }
  # never meets the :database entry in the merge block and replaces the whole
  # nested section, silently dropping its other keys.
  #
  # @param other [ConfigSection, Hash, nil] values that take precedence
  # @return [ConfigSection]
  def merge(other)
    overlay = other.is_a?(ConfigSection) ? other.to_h : (other || {})
    ConfigSection.new(deep_merge(symbolize_keys(to_h), symbolize_keys(overlay)))
  end

  def keys
    @data.keys
  end

  def each(&block)
    @data.each(&block)
  end

  private

  # Recursively merge two hashes; on conflict the overlay wins unless both
  # values are hashes, in which case they are merged in turn.
  def deep_merge(base, overlay)
    base.merge(overlay) do |_key, old_val, new_val|
      if old_val.is_a?(Hash) && new_val.is_a?(Hash)
        deep_merge(old_val, new_val)
      else
        new_val
      end
    end
  end

  # Return a copy of +hash+ with every key (at any depth) converted to a symbol.
  def symbolize_keys(hash)
    hash.each_with_object({}) do |(key, value), out|
      out[key.to_sym] = value.is_a?(Hash) ? symbolize_keys(value) : value
    end
  end
end
78
+
79
+ # HTM Configuration using Anyway Config
80
+ #
81
+ # Schema is defined in lib/htm/config/defaults.yml (single source of truth)
82
+ # Configuration uses nested sections for better organization:
83
+ # - HTM.config.database.host
84
+ # - HTM.config.embedding.provider
85
+ # - HTM.config.providers.openai.api_key
86
+ #
87
+ # Configuration sources (lowest to highest priority):
88
+ # 1. Bundled defaults: lib/htm/config/defaults.yml (ships with gem)
89
+ # 2. XDG user config:
90
+ # - ~/Library/Application Support/htm/htm.yml (macOS only)
91
+ # - ~/.config/htm/htm.yml (XDG default)
92
+ # - $XDG_CONFIG_HOME/htm/htm.yml (if XDG_CONFIG_HOME is set)
93
+ # 3. Project config: ./config/htm.yml (environment-specific)
94
+ # 4. Local overrides: ./config/htm.local.yml (gitignored)
95
+ # 5. Environment variables (HTM_*)
96
+ # 6. Explicit values passed to configure block
97
+ #
98
+ # @example Configure with environment variables
99
+ # export HTM_EMBEDDING__PROVIDER=openai
100
+ # export HTM_EMBEDDING__MODEL=text-embedding-3-small
101
+ # export HTM_PROVIDERS__OPENAI__API_KEY=sk-xxx
102
+ #
103
+ # @example Configure with XDG user config (~/.config/htm/htm.yml)
104
+ # embedding:
105
+ # provider: ollama
106
+ # model: nomic-embed-text:latest
107
+ # providers:
108
+ # ollama:
109
+ # url: http://localhost:11434
110
+ #
111
+ # @example Configure with Ruby block
112
+ # HTM.configure do |config|
113
+ # config.embedding.provider = :openai
114
+ # config.embedding.model = 'text-embedding-3-small'
115
+ # end
116
+ #
117
+ class Config < Anyway::Config
118
+ config_name :htm
119
+ env_prefix :htm
120
+
121
+ # ==========================================================================
122
+ # Schema Definition (loaded from defaults.yml - single source of truth)
123
+ # ==========================================================================
124
+
125
+ # Path to bundled defaults file (defines both schema and default values)
126
+ DEFAULTS_PATH = File.expand_path('config/defaults.yml', __dir__).freeze
127
+
128
+ # Load schema from defaults.yml at class definition time
129
+ begin
130
+ defaults_content = File.read(DEFAULTS_PATH)
131
+ raw_yaml = YAML.safe_load(
132
+ defaults_content,
133
+ permitted_classes: [Symbol],
134
+ symbolize_names: true,
135
+ aliases: true
136
+ ) || {}
137
+ SCHEMA = raw_yaml[:defaults] || {}
138
+ rescue StandardError => e
139
+ warn "HTM: Could not load schema from #{DEFAULTS_PATH}: #{e.message}"
140
+ SCHEMA = {}
141
+ end
142
+
143
+ # Nested section attributes (defined as hashes, converted to ConfigSection)
144
+ attr_config :database, :service, :embedding, :tag, :proposition,
145
+ :chunking, :circuit_breaker, :relevance, :job, :providers
146
+
147
+ # Top-level scalar attributes
148
+ attr_config :week_start, :connection_timeout, :telemetry_enabled, :log_level
149
+
150
+ # Custom environment detection: Anyway::Settings.current_environment > HTM_ENV > RAILS_ENV > RACK_ENV > 'development'
151
class << self
  # Resolve the name of the current environment.
  #
  # Candidates are consulted in order: Anyway::Settings.current_environment,
  # then the HTM_ENV, RAILS_ENV and RACK_ENV environment variables. The first
  # non-blank one wins; 'development' is the fallback.
  #
  # Blank values are skipped on purpose: an exported-but-empty variable
  # (e.g. HTM_ENV="") is truthy in Ruby and would otherwise become the
  # environment name, hiding the lower-priority sources.
  #
  # @return [String] environment name
  def env
    candidates = [
      Anyway::Settings.current_environment,
      ENV['HTM_ENV'],
      ENV['RAILS_ENV'],
      ENV['RACK_ENV']
    ]

    candidates.find { |name| name && !name.to_s.strip.empty? } || 'development'
  end
end
160
+
161
+ # ==========================================================================
162
+ # Type Coercion
163
+ # ==========================================================================
164
+
165
+ TO_SYMBOL = ->(v) { v.nil? ? nil : v.to_s.to_sym }
166
+
167
+ # Create a coercion that merges incoming value with SCHEMA defaults for a section.
168
+ # This ensures env vars like HTM_DATABASE__URL don't lose other defaults.
169
+ def self.config_section_with_defaults(section_key)
170
+ defaults = SCHEMA[section_key] || {}
171
+ ->(v) {
172
+ return v if v.is_a?(ConfigSection)
173
+ incoming = v || {}
174
+ # Deep merge: defaults first, then overlay incoming values
175
+ merged = deep_merge_hashes(defaults.dup, incoming)
176
+ ConfigSection.new(merged)
177
+ }
178
+ end
179
+
180
+ # Deep merge helper for coercion
181
# Recursively merge +overlay+ on top of +base+ without mutating either.
#
# Keys at every merged level are normalized to symbols first. The schema
# defaults are symbol-keyed, so a string-keyed overlay would otherwise sit
# next to the default instead of merging with it: nested defaults were lost
# and an explicit nil could still win once keys were symbolized later.
#
# A nil overlay value never replaces an existing base value.
#
# @param base [Hash] lower-priority values (typically schema defaults)
# @param overlay [Hash] higher-priority values
# @return [Hash] new symbol-keyed hash
def self.deep_merge_hashes(base, overlay)
  symbolize = lambda do |hash|
    hash.each_with_object({}) do |(key, value), out|
      out[key.respond_to?(:to_sym) ? key.to_sym : key] = value
    end
  end

  symbolize.call(base).merge(symbolize.call(overlay)) do |_key, old_val, new_val|
    if old_val.is_a?(Hash) && new_val.is_a?(Hash)
      deep_merge_hashes(old_val, new_val)
    else
      new_val.nil? ? old_val : new_val
    end
  end
end
190
+
191
+ coerce_types(
192
+ # Nested sections -> ConfigSection objects (with SCHEMA defaults merged)
193
+ database: config_section_with_defaults(:database),
194
+ service: config_section_with_defaults(:service),
195
+ embedding: config_section_with_defaults(:embedding),
196
+ tag: config_section_with_defaults(:tag),
197
+ proposition: config_section_with_defaults(:proposition),
198
+ chunking: config_section_with_defaults(:chunking),
199
+ circuit_breaker: config_section_with_defaults(:circuit_breaker),
200
+ relevance: config_section_with_defaults(:relevance),
201
+ job: config_section_with_defaults(:job),
202
+ providers: config_section_with_defaults(:providers),
203
+
204
+ # Top-level symbols
205
+ week_start: TO_SYMBOL,
206
+ log_level: TO_SYMBOL,
207
+
208
+ # Top-level integers
209
+ connection_timeout: :integer,
210
+
211
+ # Top-level booleans
212
+ telemetry_enabled: :boolean
213
+ )
214
+
215
+ # ==========================================================================
216
+ # Validation
217
+ # ==========================================================================
218
+
219
+ SUPPORTED_PROVIDERS = %i[
220
+ openai anthropic gemini azure ollama
221
+ huggingface openrouter bedrock deepseek
222
+ ].freeze
223
+
224
+ SUPPORTED_JOB_BACKENDS = %i[active_job sidekiq inline thread].freeze
225
+ SUPPORTED_WEEK_STARTS = %i[sunday monday].freeze
226
+
227
+ # Default embedding dimensions by provider
228
+ DEFAULT_DIMENSIONS = {
229
+ openai: 1536,
230
+ anthropic: 1024,
231
+ gemini: 768,
232
+ azure: 1536,
233
+ ollama: 768,
234
+ huggingface: 768,
235
+ openrouter: 1536,
236
+ bedrock: 1536,
237
+ deepseek: 1536
238
+ }.freeze
239
+
240
+ on_load :coerce_nested_types, :validate_config, :setup_defaults
241
+
242
+ # ==========================================================================
243
+ # Callable Accessors (not loaded from config sources)
244
+ # ==========================================================================
245
+
246
+ attr_accessor :embedding_generator, :tag_extractor, :proposition_extractor
247
+ attr_accessor :token_counter, :logger
248
+
249
+ # ==========================================================================
250
+ # Instance Methods
251
+ # ==========================================================================
252
+
253
+ def initialize(...)
254
+ super
255
+ @ollama_models_refreshed = false
256
+ @ollama_refresh_mutex = Mutex.new
257
+ end
258
+
259
+ # ==========================================================================
260
+ # Convenience Accessors (for common nested values)
261
+ # ==========================================================================
262
+
263
+ # Database convenience methods
264
+ def database_url
265
+ url = database.url
266
+ return url if url && !url.empty?
267
+
268
+ build_database_url
269
+ end
270
+
271
# Build a connection-settings hash from the effective database URL.
#
# Username and password are percent-decoded: URLs carry reserved characters
# in escaped form (e.g. "p%40ss" for "p@ss"), and handing the escaped text
# to the database driver makes authentication fail.
#
# Pool size, timeout and sslmode are read from the database section.
#
# @return [Hash] settings with nil entries removed, or {} when no database
#   URL can be determined
# @raise [URI::InvalidURIError] if the URL cannot be parsed
def database_config
  url = database_url
  return {} unless url

  require 'uri'
  uri = URI.parse(url)

  # decode_www_form_component would turn "+" into a space, which is only
  # correct for form data, so protect literal plus signs before decoding.
  unescape = ->(part) { part && URI.decode_www_form_component(part.gsub('+', '%2B')) }

  {
    adapter: 'postgresql',
    host: uri.host,
    port: uri.port || 5432,
    database: uri.path&.sub(%r{^/}, ''),
    username: unescape.call(uri.user),
    password: unescape.call(uri.password),
    pool: database.pool_size.to_i,
    timeout: database.timeout.to_i,
    sslmode: database.sslmode,
    encoding: 'unicode',
    prepared_statements: false,
    advisory_locks: false
  }.compact
end
295
+
296
+ def database_configured?
297
+ url = database_url
298
+ (url && !url.empty?) || (database.name && !database.name.empty?)
299
+ end
300
+
301
+ # Embedding convenience accessors
302
+ def embedding_provider
303
+ provider = embedding.provider
304
+ provider.is_a?(Symbol) ? provider : provider&.to_sym
305
+ end
306
+
307
+ def embedding_model
308
+ embedding.model
309
+ end
310
+
311
+ def embedding_dimensions
312
+ embedding.dimensions.to_i
313
+ end
314
+
315
+ def embedding_timeout
316
+ embedding.timeout.to_i
317
+ end
318
+
319
+ def max_embedding_dimension
320
+ embedding.max_dimension.to_i
321
+ end
322
+
323
+ # Tag convenience accessors
324
+ def tag_provider
325
+ provider = tag.provider
326
+ provider.is_a?(Symbol) ? provider : provider&.to_sym
327
+ end
328
+
329
+ def tag_model
330
+ tag.model
331
+ end
332
+
333
+ def tag_timeout
334
+ tag.timeout.to_i
335
+ end
336
+
337
+ def max_tag_depth
338
+ tag.max_depth.to_i
339
+ end
340
+
341
+ # Proposition convenience accessors
342
+ def proposition_provider
343
+ provider = proposition.provider
344
+ provider.is_a?(Symbol) ? provider : provider&.to_sym
345
+ end
346
+
347
+ def proposition_model
348
+ proposition.model
349
+ end
350
+
351
+ def proposition_timeout
352
+ proposition.timeout.to_i
353
+ end
354
+
355
+ def extract_propositions
356
+ proposition.enabled
357
+ end
358
+
359
+ # Chunking convenience accessors
360
+ def chunk_size
361
+ chunking.size.to_i
362
+ end
363
+
364
+ def chunk_overlap
365
+ chunking.overlap.to_i
366
+ end
367
+
368
+ # Circuit breaker convenience accessors
369
+ def circuit_breaker_failure_threshold
370
+ circuit_breaker.failure_threshold.to_i
371
+ end
372
+
373
+ def circuit_breaker_reset_timeout
374
+ circuit_breaker.reset_timeout.to_i
375
+ end
376
+
377
+ def circuit_breaker_half_open_max_calls
378
+ circuit_breaker.half_open_max_calls.to_i
379
+ end
380
+
381
+ # Relevance scoring convenience accessors
382
+ def relevance_semantic_weight
383
+ relevance.semantic_weight.to_f
384
+ end
385
+
386
+ def relevance_tag_weight
387
+ relevance.tag_weight.to_f
388
+ end
389
+
390
+ def relevance_recency_weight
391
+ relevance.recency_weight.to_f
392
+ end
393
+
394
+ def relevance_access_weight
395
+ relevance.access_weight.to_f
396
+ end
397
+
398
+ def relevance_recency_half_life_hours
399
+ relevance.recency_half_life_hours.to_f
400
+ end
401
+
402
+ # Job backend convenience accessor
403
+ def job_backend
404
+ backend = job.backend
405
+ return nil if backend.nil?
406
+
407
+ backend.is_a?(Symbol) ? backend : backend.to_sym
408
+ end
409
+
410
+ # Service name convenience accessor
411
+ def service_name
412
+ service.name
413
+ end
414
+
415
+ # Provider credential convenience accessors
416
+ def openai_api_key
417
+ providers.openai&.api_key
418
+ end
419
+
420
+ def openai_organization
421
+ providers.openai&.organization
422
+ end
423
+
424
+ def openai_project
425
+ providers.openai&.project
426
+ end
427
+
428
+ def anthropic_api_key
429
+ providers.anthropic&.api_key
430
+ end
431
+
432
+ def gemini_api_key
433
+ providers.gemini&.api_key
434
+ end
435
+
436
+ def azure_api_key
437
+ providers.azure&.api_key
438
+ end
439
+
440
+ def azure_endpoint
441
+ providers.azure&.endpoint
442
+ end
443
+
444
+ def azure_api_version
445
+ providers.azure&.api_version
446
+ end
447
+
448
+ def ollama_url
449
+ providers.ollama&.url || 'http://localhost:11434'
450
+ end
451
+
452
+ def huggingface_api_key
453
+ providers.huggingface&.api_key
454
+ end
455
+
456
+ def openrouter_api_key
457
+ providers.openrouter&.api_key
458
+ end
459
+
460
+ def bedrock_access_key
461
+ providers.bedrock&.access_key
462
+ end
463
+
464
+ def bedrock_secret_key
465
+ providers.bedrock&.secret_key
466
+ end
467
+
468
+ def bedrock_region
469
+ providers.bedrock&.region || 'us-east-1'
470
+ end
471
+
472
+ def deepseek_api_key
473
+ providers.deepseek&.api_key
474
+ end
475
+
476
+ # ==========================================================================
477
+ # Environment Helpers
478
+ # ==========================================================================
479
+
480
+ def test?
481
+ self.class.env == 'test'
482
+ end
483
+
484
+ def development?
485
+ self.class.env == 'development'
486
+ end
487
+
488
+ def production?
489
+ self.class.env == 'production'
490
+ end
491
+
492
+ def environment
493
+ self.class.env
494
+ end
495
+
496
+ # ==========================================================================
497
+ # XDG Config Path Helpers
498
+ # ==========================================================================
499
+
500
+ def self.xdg_config_paths
501
+ HTM::Loaders::XdgConfigLoader.config_paths
502
+ end
503
+
504
# Path of the user-level htm.yml under the XDG configuration directory.
#
# Uses $XDG_CONFIG_HOME when it is set to a non-empty value, otherwise
# falls back to ~/.config.
#
# @return [String] "<config dir>/htm/htm.yml"
def self.xdg_config_file
  configured = ENV['XDG_CONFIG_HOME']
  root = configured.nil? || configured.empty? ? File.expand_path('~/.config') : configured

  File.join(root, 'htm', 'htm.yml')
end
513
+
514
+ def self.active_xdg_config_file
515
+ HTM::Loaders::XdgConfigLoader.find_config_file('htm')
516
+ end
517
+
518
+ # ==========================================================================
519
+ # Ollama Helpers
520
+ # ==========================================================================
521
+
522
# Ensure a model name carries an explicit tag by appending ":latest" when
# the name has no ":" in it.
#
# Symbols are accepted as well as strings (values assigned from Ruby code
# may be symbols); a tagged or suffixed name is returned as a String.
#
# @param model_name [String, Symbol, nil]
# @return [String, Symbol, nil] nil or an empty name is returned unchanged
def normalize_ollama_model(model_name)
  return model_name if model_name.nil?

  name = model_name.to_s
  return model_name if name.empty?

  name.include?(':') ? name : "#{name}:latest"
end
528
+
529
+ def configure_ruby_llm(provider = nil)
530
+ require 'ruby_llm'
531
+
532
+ provider ||= embedding_provider
533
+
534
+ RubyLLM.configure do |config|
535
+ case provider
536
+ when :openai
537
+ config.openai_api_key = openai_api_key if openai_api_key
538
+ config.openai_organization = openai_organization if openai_organization && config.respond_to?(:openai_organization=)
539
+ config.openai_project = openai_project if openai_project && config.respond_to?(:openai_project=)
540
+ when :anthropic
541
+ config.anthropic_api_key = anthropic_api_key if anthropic_api_key
542
+ when :gemini
543
+ config.gemini_api_key = gemini_api_key if gemini_api_key
544
+ when :azure
545
+ config.azure_api_key = azure_api_key if azure_api_key && config.respond_to?(:azure_api_key=)
546
+ config.azure_endpoint = azure_endpoint if azure_endpoint && config.respond_to?(:azure_endpoint=)
547
+ config.azure_api_version = azure_api_version if azure_api_version && config.respond_to?(:azure_api_version=)
548
+ when :ollama
549
+ ollama_api_base = if ollama_url.end_with?('/v1') || ollama_url.end_with?('/v1/')
550
+ ollama_url.sub(%r{/+$}, '')
551
+ else
552
+ "#{ollama_url.sub(%r{/+$}, '')}/v1"
553
+ end
554
+ config.ollama_api_base = ollama_api_base
555
+ when :huggingface
556
+ config.huggingface_api_key = huggingface_api_key if huggingface_api_key && config.respond_to?(:huggingface_api_key=)
557
+ when :openrouter
558
+ config.openrouter_api_key = openrouter_api_key if openrouter_api_key && config.respond_to?(:openrouter_api_key=)
559
+ when :bedrock
560
+ config.bedrock_api_key = bedrock_access_key if bedrock_access_key && config.respond_to?(:bedrock_api_key=)
561
+ config.bedrock_secret_key = bedrock_secret_key if bedrock_secret_key && config.respond_to?(:bedrock_secret_key=)
562
+ config.bedrock_region = bedrock_region if bedrock_region && config.respond_to?(:bedrock_region=)
563
+ when :deepseek
564
+ config.deepseek_api_key = deepseek_api_key if deepseek_api_key && config.respond_to?(:deepseek_api_key=)
565
+ end
566
+ end
567
+ end
568
+
569
+ def refresh_ollama_models!
570
+ @ollama_refresh_mutex.synchronize do
571
+ unless @ollama_models_refreshed
572
+ require 'ruby_llm'
573
+ RubyLLM.models.refresh!
574
+ @ollama_models_refreshed = true
575
+ end
576
+ end
577
+ end
578
+
579
+ def reset_to_defaults
580
+ @embedding_generator = build_default_embedding_generator
581
+ @tag_extractor = build_default_tag_extractor
582
+ @proposition_extractor = build_default_proposition_extractor
583
+ @token_counter = build_default_token_counter
584
+ @logger = build_default_logger
585
+ end
586
+
587
+ def validate!
588
+ validate_callables
589
+ validate_logger
590
+ end
591
+
592
+ def validate_settings!
593
+ validate_providers
594
+ validate_job_backend
595
+ validate_week_start
596
+ validate_relevance_weights
597
+ end
598
+
599
+ private
600
+
601
# Assemble a postgresql:// URL from the individual database settings
# (name, user, password, host, port).
#
# User and password are percent-encoded so that reserved characters such as
# "@", ":", "/" or spaces cannot corrupt the URL structure.
#
# @return [String, nil] the URL, or nil when no database name is configured
def build_database_url
  return nil unless database.name && !database.name.empty?

  require 'uri'
  # encode_www_form_component writes spaces as "+", which is form syntax;
  # inside a URL's userinfo a space must be "%20".
  escape = ->(part) { URI.encode_www_form_component(part.to_s).gsub('+', '%20') }

  auth = if database.user && !database.user.empty?
           credentials = escape.call(database.user)
           if database.password && !database.password.empty?
             credentials = "#{credentials}:#{escape.call(database.password)}"
           end
           "#{credentials}@"
         else
           ''
         end

  "postgresql://#{auth}#{database.host}:#{database.port}/#{database.name}"
end
612
+
613
+ # ==========================================================================
614
+ # Type Coercion Callback
615
+ # ==========================================================================
616
+
617
+ def coerce_nested_types
618
+ # Ensure nested provider sections are ConfigSections
619
+ if providers.is_a?(ConfigSection)
620
+ %i[openai anthropic gemini azure ollama huggingface openrouter bedrock deepseek].each do |provider|
621
+ value = providers[provider]
622
+ providers[provider] = ConfigSection.new(value) if value.is_a?(Hash)
623
+ end
624
+ end
625
+ end
626
+
627
+ # ==========================================================================
628
+ # Validation Callbacks
629
+ # ==========================================================================
630
+
631
+ def validate_config
632
+ validate_providers
633
+ validate_job_backend
634
+ validate_week_start
635
+ validate_relevance_weights
636
+ end
637
+
638
+ def validate_providers
639
+ validate_provider(:embedding_provider, embedding_provider)
640
+ validate_provider(:tag_provider, tag_provider)
641
+ validate_provider(:proposition_provider, proposition_provider)
642
+ end
643
+
644
+ def validate_provider(name, value)
645
+ return if value.nil?
646
+
647
+ unless SUPPORTED_PROVIDERS.include?(value)
648
+ raise_validation_error("#{name} must be one of: #{SUPPORTED_PROVIDERS.join(', ')} (got #{value.inspect})")
649
+ end
650
+ end
651
+
652
+ def validate_job_backend
653
+ return unless job_backend
654
+
655
+ unless SUPPORTED_JOB_BACKENDS.include?(job_backend)
656
+ raise_validation_error("job.backend must be one of: #{SUPPORTED_JOB_BACKENDS.join(', ')} (got #{job_backend.inspect})")
657
+ end
658
+ end
659
+
660
+ def validate_week_start
661
+ unless SUPPORTED_WEEK_STARTS.include?(week_start)
662
+ raise_validation_error("week_start must be one of: #{SUPPORTED_WEEK_STARTS.join(', ')} (got #{week_start.inspect})")
663
+ end
664
+ end
665
+
666
+ def validate_relevance_weights
667
+ total = relevance_semantic_weight + relevance_tag_weight +
668
+ relevance_recency_weight + relevance_access_weight
669
+
670
+ unless (0.99..1.01).cover?(total)
671
+ raise_validation_error("relevance weights must sum to 1.0 (got #{total})")
672
+ end
673
+ end
674
+
675
+ def validate_callables
676
+ unless @embedding_generator.respond_to?(:call)
677
+ raise HTM::ValidationError, "embedding_generator must be callable"
678
+ end
679
+
680
+ unless @tag_extractor.respond_to?(:call)
681
+ raise HTM::ValidationError, "tag_extractor must be callable"
682
+ end
683
+
684
+ unless @proposition_extractor.respond_to?(:call)
685
+ raise HTM::ValidationError, "proposition_extractor must be callable"
686
+ end
687
+
688
+ unless @token_counter.respond_to?(:call)
689
+ raise HTM::ValidationError, "token_counter must be callable"
690
+ end
691
+ end
692
+
693
+ def validate_logger
694
+ unless @logger.respond_to?(:info) && @logger.respond_to?(:warn) && @logger.respond_to?(:error)
695
+ raise HTM::ValidationError, "logger must respond to :info, :warn, and :error"
696
+ end
697
+ end
698
+
699
+ # ==========================================================================
700
+ # Setup Defaults Callback
701
+ # ==========================================================================
702
+
703
+ def setup_defaults
704
+ job.backend = detect_job_backend if job_backend.nil?
705
+ @logger ||= build_default_logger
706
+ @embedding_generator ||= build_default_embedding_generator
707
+ @tag_extractor ||= build_default_tag_extractor
708
+ @proposition_extractor ||= build_default_proposition_extractor
709
+ @token_counter ||= build_default_token_counter
710
+ end
711
+
712
+ def detect_job_backend
713
+ return :inline if test?
714
+ return :active_job if defined?(ActiveJob)
715
+ return :sidekiq if defined?(Sidekiq)
716
+
717
+ :thread
718
+ end
719
+
720
# Create the default logger: writes to $stdout using the
# "[timestamp] SEVERITY -- HTM: message" line format.
#
# The level is applied only when log_level is set. Assigning nil to
# Logger#level= raises ArgumentError, which would abort configuration
# loading when no log level is configured; in that case Logger's own
# default level is kept.
#
# @return [Logger]
def build_default_logger
  logger = Logger.new($stdout)
  level = log_level
  logger.level = level if level
  logger.formatter = proc do |severity, datetime, _progname, msg|
    "[#{datetime.strftime('%Y-%m-%d %H:%M:%S')}] #{severity} -- HTM: #{msg}\n"
  end
  logger
end
728
+
729
+ def build_default_token_counter
730
+ lambda do |text|
731
+ require 'tiktoken_ruby' unless defined?(Tiktoken)
732
+ encoder = Tiktoken.encoding_for_model("gpt-3.5-turbo")
733
+ encoder.encode(text).length
734
+ end
735
+ end
736
+
737
+ def build_default_embedding_generator
738
+ lambda do |text|
739
+ require 'ruby_llm' unless defined?(RubyLLM)
740
+
741
+ configure_ruby_llm(embedding_provider)
742
+ refresh_ollama_models! if embedding_provider == :ollama
743
+
744
+ model = embedding_provider == :ollama ? normalize_ollama_model(embedding_model) : embedding_model
745
+ response = RubyLLM.embed(text, model: model)
746
+ embedding = extract_embedding_from_response(response)
747
+
748
+ unless embedding.is_a?(Array) && embedding.all? { |v| v.is_a?(Numeric) }
749
+ raise HTM::EmbeddingError, "Invalid embedding response format from #{embedding_provider}"
750
+ end
751
+
752
+ embedding
753
+ end
754
+ end
755
+
756
+ def build_default_tag_extractor
757
+ lambda do |text, existing_ontology = []|
758
+ require 'ruby_llm' unless defined?(RubyLLM)
759
+
760
+ configure_ruby_llm(tag_provider)
761
+ refresh_ollama_models! if tag_provider == :ollama
762
+
763
+ model = tag_provider == :ollama ? normalize_ollama_model(tag_model) : tag_model
764
+
765
+ prompt = build_tag_extraction_prompt(text, existing_ontology)
766
+ system_prompt = build_tag_system_prompt
767
+
768
+ chat = RubyLLM.chat(model: model)
769
+ chat.with_instructions(system_prompt)
770
+ response = chat.ask(prompt)
771
+
772
+ parse_tag_response(extract_text_from_response(response))
773
+ end
774
+ end
775
+
776
+ def build_default_proposition_extractor
777
+ lambda do |text|
778
+ require 'ruby_llm' unless defined?(RubyLLM)
779
+
780
+ configure_ruby_llm(proposition_provider)
781
+ refresh_ollama_models! if proposition_provider == :ollama
782
+
783
+ model = proposition_provider == :ollama ? normalize_ollama_model(proposition_model) : proposition_model
784
+
785
+ prompt = build_proposition_extraction_prompt(text)
786
+ system_prompt = build_proposition_system_prompt
787
+
788
+ chat = RubyLLM.chat(model: model)
789
+ chat.with_instructions(system_prompt)
790
+ response = chat.ask(prompt)
791
+
792
+ parse_proposition_response(extract_text_from_response(response))
793
+ end
794
+ end
795
+
796
+ # ==========================================================================
797
+ # Response Extraction Helpers
798
+ # ==========================================================================
799
+
800
# Pull the embedding vector out of whatever shape the provider returned.
#
# Checks, in order: a plain Array, an object exposing #vectors (the first
# row is used when it is an array of arrays), #to_a, #embedding, and finally
# a raw @vectors instance variable.
#
# @param response [Object, nil]
# @return [Array, Object, nil] nil when response is nil/false
# @raise [HTM::EmbeddingError] when no known shape matches
def extract_embedding_from_response(response)
  return nil unless response
  return response if response.is_a?(Array)

  if response.respond_to?(:vectors)
    candidate = response.vectors
    nested = candidate.is_a?(Array) && candidate.first.is_a?(Array)
    return nested ? candidate.first : candidate
  end

  return response.to_a if response.respond_to?(:to_a)
  return response.embedding if response.respond_to?(:embedding)

  # Last resort: peek at a @vectors ivar that has no public reader.
  if response.respond_to?(:instance_variable_get)
    raw = response.instance_variable_get(:@vectors)
    if raw.is_a?(Array)
      return raw.first.is_a?(Array) ? raw.first : raw
    end
  end

  raise HTM::EmbeddingError, "Cannot extract embedding from response: #{response.class}"
end
822
+
823
# Reduce an LLM response to plain text.
#
# Strings pass through untouched; otherwise #content is preferred, then
# #text, and #to_s is the final fallback.
#
# @param response [Object, nil]
# @return [String] '' when response is nil/false
def extract_text_from_response(response)
  return '' unless response
  return response if response.is_a?(String)
  return response.content.to_s if response.respond_to?(:content)
  return response.text.to_s if response.respond_to?(:text)

  response.to_s
end
833
+
834
# Turn the raw LLM output into a list of usable tags.
#
# One tag is expected per line. A line is kept when, after trimming, it is a
# colon-separated chain of lowercase alphanumeric/hyphen segments with fewer
# than max_tag_depth colons.
#
# @param text [#to_s] raw response text
# @return [Array<String>] accepted tags in their original order
def parse_tag_response(text)
  text.to_s.split("\n").each_with_object([]) do |line, accepted|
    candidate = line.strip
    next if candidate.empty?
    next unless candidate =~ /^[a-z0-9\-]+(:[a-z0-9\-]+)*$/

    accepted << candidate if candidate.count(':') < max_tag_depth
  end
end
839
+
840
# Split the raw LLM output into individual propositions.
#
# Each line is trimmed, a single leading "-" or "*" list marker is removed,
# and lines that end up empty are dropped.
#
# @param text [#to_s] raw response text
# @return [Array<String>] cleaned proposition lines
def parse_proposition_response(text)
  text.to_s.split("\n").filter_map do |raw_line|
    cleaned = raw_line.strip.sub(/^[-*]\s*/, '').strip
    cleaned unless cleaned.empty?
  end
end
848
+
849
+ # ==========================================================================
850
+ # Prompt Builders
851
+ #
852
+ # These methods use configurable prompt templates from defaults.yml.
853
+ # Templates use %{placeholder} syntax for runtime interpolation.
854
+ # ==========================================================================
855
+
856
# Render the user prompt for tag extraction from the configured templates.
#
# When an ontology is supplied, up to 20 randomly sampled tags are
# interpolated into tag.taxonomy_context_existing; otherwise
# tag.taxonomy_context_empty is used. The result is substituted into
# tag.user_prompt_template together with the text and max_tag_depth.
#
# A nil ontology is treated as empty rather than raising NoMethodError.
#
# @param text [String] content to extract tags from
# @param existing_ontology [Array<String>, nil] known tags
# @return [String] the rendered prompt
def build_tag_extraction_prompt(text, existing_ontology)
  known_tags = Array(existing_ontology)

  taxonomy_context = if known_tags.any?
                       # Array#sample already caps the count at the array size.
                       sample_tags = known_tags.sample(20)
                       tag.taxonomy_context_existing % { sample_tags: sample_tags.join(', ') }
                     else
                       tag.taxonomy_context_empty
                     end

  tag.user_prompt_template % {
    text: text,
    max_depth: max_tag_depth,
    taxonomy_context: taxonomy_context
  }
end
870
+
871
+ def build_tag_system_prompt
872
+ tag.system_prompt.to_s.strip
873
+ end
874
+
875
+ def build_proposition_extraction_prompt(text)
876
+ proposition.user_prompt_template % { text: text }
877
+ end
878
+
879
+ def build_proposition_system_prompt
880
+ proposition.system_prompt.to_s.strip
881
+ end
882
+ end
883
+ end
884
+
885
+ # Register custom loaders after Config class is defined
886
+ # Order matters: defaults (lowest priority) -> XDG -> project config -> ENV (highest)
887
+ require_relative 'loaders/defaults_loader'
888
+ require_relative 'loaders/xdg_config_loader'