fact_db 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/.envrc +1 -0
  3. data/CHANGELOG.md +48 -0
  4. data/COMMITS.md +196 -0
  5. data/README.md +102 -0
  6. data/Rakefile +41 -0
  7. data/db/migrate/001_enable_extensions.rb +7 -0
  8. data/db/migrate/002_create_contents.rb +44 -0
  9. data/db/migrate/003_create_entities.rb +36 -0
  10. data/db/migrate/004_create_entity_aliases.rb +18 -0
  11. data/db/migrate/005_create_facts.rb +65 -0
  12. data/db/migrate/006_create_entity_mentions.rb +18 -0
  13. data/db/migrate/007_create_fact_sources.rb +18 -0
  14. data/docs/api/extractors/index.md +71 -0
  15. data/docs/api/extractors/llm.md +162 -0
  16. data/docs/api/extractors/manual.md +92 -0
  17. data/docs/api/extractors/rule-based.md +165 -0
  18. data/docs/api/facts.md +300 -0
  19. data/docs/api/index.md +66 -0
  20. data/docs/api/models/content.md +165 -0
  21. data/docs/api/models/entity.md +202 -0
  22. data/docs/api/models/fact.md +270 -0
  23. data/docs/api/models/index.md +77 -0
  24. data/docs/api/pipeline/extraction.md +175 -0
  25. data/docs/api/pipeline/index.md +72 -0
  26. data/docs/api/pipeline/resolution.md +209 -0
  27. data/docs/api/services/content-service.md +166 -0
  28. data/docs/api/services/entity-service.md +202 -0
  29. data/docs/api/services/fact-service.md +223 -0
  30. data/docs/api/services/index.md +55 -0
  31. data/docs/architecture/database-schema.md +293 -0
  32. data/docs/architecture/entity-resolution.md +293 -0
  33. data/docs/architecture/index.md +149 -0
  34. data/docs/architecture/temporal-facts.md +268 -0
  35. data/docs/architecture/three-layer-model.md +242 -0
  36. data/docs/assets/css/custom.css +137 -0
  37. data/docs/assets/fact_db.jpg +0 -0
  38. data/docs/assets/images/fact_db.jpg +0 -0
  39. data/docs/concepts.md +183 -0
  40. data/docs/examples/basic-usage.md +235 -0
  41. data/docs/examples/hr-onboarding.md +312 -0
  42. data/docs/examples/index.md +64 -0
  43. data/docs/examples/news-analysis.md +288 -0
  44. data/docs/getting-started/database-setup.md +170 -0
  45. data/docs/getting-started/index.md +71 -0
  46. data/docs/getting-started/installation.md +98 -0
  47. data/docs/getting-started/quick-start.md +191 -0
  48. data/docs/guides/batch-processing.md +325 -0
  49. data/docs/guides/configuration.md +243 -0
  50. data/docs/guides/entity-management.md +364 -0
  51. data/docs/guides/extracting-facts.md +299 -0
  52. data/docs/guides/index.md +22 -0
  53. data/docs/guides/ingesting-content.md +252 -0
  54. data/docs/guides/llm-integration.md +299 -0
  55. data/docs/guides/temporal-queries.md +315 -0
  56. data/docs/index.md +121 -0
  57. data/examples/README.md +130 -0
  58. data/examples/basic_usage.rb +164 -0
  59. data/examples/entity_management.rb +216 -0
  60. data/examples/hr_system.rb +428 -0
  61. data/examples/rule_based_extraction.rb +258 -0
  62. data/examples/temporal_queries.rb +245 -0
  63. data/lib/fact_db/config.rb +71 -0
  64. data/lib/fact_db/database.rb +45 -0
  65. data/lib/fact_db/errors.rb +10 -0
  66. data/lib/fact_db/extractors/base.rb +117 -0
  67. data/lib/fact_db/extractors/llm_extractor.rb +179 -0
  68. data/lib/fact_db/extractors/manual_extractor.rb +53 -0
  69. data/lib/fact_db/extractors/rule_based_extractor.rb +228 -0
  70. data/lib/fact_db/llm/adapter.rb +109 -0
  71. data/lib/fact_db/models/content.rb +62 -0
  72. data/lib/fact_db/models/entity.rb +84 -0
  73. data/lib/fact_db/models/entity_alias.rb +26 -0
  74. data/lib/fact_db/models/entity_mention.rb +33 -0
  75. data/lib/fact_db/models/fact.rb +192 -0
  76. data/lib/fact_db/models/fact_source.rb +35 -0
  77. data/lib/fact_db/pipeline/extraction_pipeline.rb +146 -0
  78. data/lib/fact_db/pipeline/resolution_pipeline.rb +129 -0
  79. data/lib/fact_db/resolution/entity_resolver.rb +261 -0
  80. data/lib/fact_db/resolution/fact_resolver.rb +259 -0
  81. data/lib/fact_db/services/content_service.rb +93 -0
  82. data/lib/fact_db/services/entity_service.rb +150 -0
  83. data/lib/fact_db/services/fact_service.rb +193 -0
  84. data/lib/fact_db/temporal/query.rb +125 -0
  85. data/lib/fact_db/temporal/timeline.rb +134 -0
  86. data/lib/fact_db/version.rb +5 -0
  87. data/lib/fact_db.rb +141 -0
  88. data/mkdocs.yml +198 -0
  89. metadata +288 -0
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FactDb
4
+ module Extractors
5
class ManualExtractor < Base
  # Pass-through extraction: the supplied text becomes exactly one fact.
  # Meant for API-driven fact creation, where the caller provides the fact
  # text and its metadata directly instead of having them inferred.
  #
  # @param text [String, nil] the fact text
  # @param context [Hash] optional :valid_at, :captured_at, :invalid_at,
  #   :mentions, :confidence and :metadata entries
  # @return [Array] empty when text is nil or blank, otherwise a single-element
  #   array holding whatever the inherited build_fact helper returns
  def extract(text, context = {})
    blank = text.nil? || text.strip.empty?
    return [] if blank

    # Precedence: explicit validity start, then capture time, then "now".
    starts_at = context[:valid_at] || context[:captured_at] || Time.current

    fact = build_fact(
      text: text,
      valid_at: starts_at,
      invalid_at: context[:invalid_at],
      mentions: context[:mentions] || [],
      confidence: context[:confidence] || 1.0,
      metadata: context[:metadata] || {}
    )

    [fact]
  end

  # Entities are never inferred by this extractor; callers are expected to
  # supply them explicitly (see #create_entity).
  #
  # @param text [String] ignored
  # @return [Array] always empty
  def extract_entities(text)
    []
  end

  # Convenience wrapper around #extract for building one fact with full
  # control over every field.
  #
  # @return [Object, nil] the built fact, or nil when text is nil or blank
  def create_fact(text:, valid_at:, invalid_at: nil, mentions: [], confidence: 1.0, metadata: {})
    context = {
      valid_at: valid_at,
      invalid_at: invalid_at,
      mentions: mentions,
      confidence: confidence,
      metadata: metadata
    }

    extract(text, context).first
  end

  # Convenience wrapper that forwards straight to the inherited build_entity
  # helper.
  def create_entity(name:, type:, aliases: [], attributes: {})
    build_entity(name: name, type: type, aliases: aliases, attributes: attributes)
  end
end
52
+ end
53
+ end
@@ -0,0 +1,228 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FactDb
4
+ module Extractors
5
# Regex-driven extractor for employment, relationship and location facts.
#
# Relies on helpers that are not defined in this class (build_fact,
# build_mention, build_entity, parse_date) - presumably inherited from Base.
class RuleBasedExtractor < Base
  # Date patterns for temporal extraction
  DATE_PATTERNS = [
    # "on January 10, 2024"
    /(?:on|since|from|as of|starting)\s+(\w+\s+\d{1,2},?\s+\d{4})/i,
    # "on 2024-01-10"
    /(?:on|since|from|as of|starting)\s+(\d{4}-\d{2}-\d{2})/i,
    # "in January 2024"
    /(?:in|during)\s+(\w+\s+\d{4})/i,
    # "in 2024"
    /(?:in|during)\s+(\d{4})\b/i
  ].freeze

  END_DATE_PATTERNS = [
    # "until January 10, 2024"
    /(?:until|through|to|ended|left)\s+(\w+\s+\d{1,2},?\s+\d{4})/i,
    /(?:until|through|to|ended|left)\s+(\d{4}-\d{2}-\d{2})/i
  ].freeze

  # Employment patterns
  EMPLOYMENT_PATTERNS = [
    # "Paula works at Microsoft"
    /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:works?|worked|is working)\s+(?:at|for)\s+(\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)/,
    # "Paula joined Microsoft"
    /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:joined|started at|was hired by)\s+(\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)/,
    # "Paula left Microsoft"
    /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:left|departed|resigned from|was fired from)\s+(\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)/,
    # "Paula is a Principal Engineer at Microsoft"
    /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:is|was|became)\s+(?:a\s+)?([A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)\s+at\s+(\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)/
  ].freeze

  # Relationship patterns
  RELATIONSHIP_PATTERNS = [
    # "Paula is married to John"
    /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:is|was)\s+(?:married to|engaged to|dating)\s+(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)/,
    # "Paula is the CEO of Microsoft"
    /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:is|was)\s+(?:the\s+)?(\w+(?:\s+\w+)*)\s+of\s+(\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)/
  ].freeze

  # Location patterns
  LOCATION_PATTERNS = [
    # "Paula lives in Seattle"
    /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:lives?|lived|is based|was based|relocated)\s+(?:in|to)\s+(\b[A-Z][A-Za-z]+(?:,?\s+[A-Z]{2})?)/,
    # "Microsoft is headquartered in Redmond"
    /(\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)\s+(?:is|was)\s+(?:headquartered|located|based)\s+in\s+(\b[A-Z][A-Za-z]+(?:,?\s+[A-Z]{2})?)/
  ].freeze

  # Capitalised words that must not be treated as entity names on their own.
  # Kept as a frozen constant so the list is built once, not on every lookup.
  COMMON_WORDS = %w[
    The A An And Or But Is Was Were Are Been
    Has Have Had Will Would Could Should
    This That These Those
    January February March April May June July August September October November December
    Monday Tuesday Wednesday Thursday Friday Saturday Sunday
    Inc Corp Ltd LLC Company Corporation
  ].freeze

  # Verb phrases (directly after the subject) that signal the end of employment.
  TERMINATION_AFTER_SUBJECT = /\A\s+(?:left|departed|resigned|was fired)/i

  # Verb phrases (directly after the subject) that imply the subject is a person.
  RESIDENCE_AFTER_SUBJECT = /\A\s+(?:lives?|lived)/i

  # Runs every rule family over the text and returns the facts found.
  #
  # @param text [String, nil] text to analyse
  # @param context [Hash] optional :captured_at used as the fallback date
  # @return [Array] facts built via build_fact, de-duplicated by their :text
  def extract(text, context = {})
    return [] if text.nil? || text.strip.empty?

    facts = extract_employment_facts(text, context) +
            extract_relationship_facts(text, context) +
            extract_location_facts(text, context)

    facts.uniq { |f| f[:text] }
  end

  # Collects people, organizations and places named in the text.
  #
  # @param text [String, nil] text to analyse
  # @return [Array] entities built via build_entity, de-duplicated
  #   case-insensitively by their :name
  def extract_entities(text)
    return [] if text.nil? || text.strip.empty?

    entities = []

    # Person names: two or more consecutive capitalised words.
    text.scan(/\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b/).flatten.uniq.each do |name|
      next if common_word?(name)

      entities << build_entity(name: name, type: "person")
    end

    # Organizations: the last capture of every employment pattern.
    EMPLOYMENT_PATTERNS.each do |pattern|
      text.scan(pattern).each do |match|
        org_name = match.last
        entities << build_entity(name: org_name, type: "organization") unless common_word?(org_name)
      end
    end

    # Places: the last capture of every location pattern.
    LOCATION_PATTERNS.each do |pattern|
      text.scan(pattern).each do |match|
        location = match.last
        entities << build_entity(name: location, type: "place") unless common_word?(location)
      end
    end

    entities.uniq { |e| e[:name].downcase }
  end

  private

  # Builds one fact per employment-pattern match.
  #
  # The fact text is the full matched phrase (verb included), so "joined X"
  # and "left X" stay distinct facts. Whether a match is a termination is
  # decided from the match itself, not from unrelated sentences in the text.
  def extract_employment_facts(text, context)
    default_date = context[:captured_at] || Time.current

    EMPLOYMENT_PATTERNS.flat_map do |pattern|
      scan_matches(text, pattern).map do |m|
        person, *rest = m.captures
        org = rest.last

        is_termination = after_subject(m).match?(TERMINATION_AFTER_SUBJECT)

        # Dates are still looked up in the whole text (not just the match).
        valid_at = extract_start_date(text) || default_date
        invalid_at = is_termination ? (extract_end_date(text) || default_date) : nil

        mentions = [
          build_mention(name: person, type: "person", role: "subject"),
          build_mention(name: org, type: "organization", role: "object")
        ]

        # Three-capture pattern: the middle capture is the job title.
        if rest.length > 1
          mentions << build_mention(name: rest[0], type: "concept", role: "instrument")
        end

        build_fact(
          text: matched_phrase(m),
          valid_at: valid_at,
          invalid_at: invalid_at,
          mentions: mentions,
          confidence: 0.8
        )
      end
    end
  end

  # Builds one fact per relationship-pattern match, using the full matched
  # phrase as the fact text.
  def extract_relationship_facts(text, context)
    default_date = context[:captured_at] || Time.current

    RELATIONSHIP_PATTERNS.flat_map do |pattern|
      scan_matches(text, pattern).map do |m|
        build_fact(
          text: matched_phrase(m),
          valid_at: extract_start_date(text) || default_date,
          invalid_at: extract_end_date(text),
          mentions: relationship_mentions(m.captures),
          confidence: 0.75
        )
      end
    end
  end

  # Mentions for a relationship match.
  #
  # Two captures ("Paula is married to John"): both are people.
  # Three captures ("Paula is the CEO of Microsoft"): the middle capture is a
  # title, recorded like the job title in employment facts, and the last
  # capture is typed as an organization, following the pattern's documented
  # example.
  # NOTE(review): "X is the mother of Y" also matches the three-capture
  # pattern; Y would be typed as an organization - confirm acceptable.
  def relationship_mentions(captures)
    subject, *rest = captures
    mentions = [build_mention(name: subject, type: "person", role: "subject")]

    if rest.length > 1
      mentions << build_mention(name: rest.first, type: "concept", role: "instrument")
      mentions << build_mention(name: rest.last, type: "organization", role: "object")
    else
      mentions << build_mention(name: rest.last, type: "person", role: "object")
    end

    mentions
  end

  # Builds one "<entity> is located in <place>" fact per location match.
  def extract_location_facts(text, context)
    default_date = context[:captured_at] || Time.current

    LOCATION_PATTERNS.flat_map do |pattern|
      scan_matches(text, pattern).map do |m|
        entity_name, location = m.captures

        # "lives"/"lived" in this very match implies a person; anything else
        # is treated as an organization.
        entity_type = after_subject(m).match?(RESIDENCE_AFTER_SUBJECT) ? "person" : "organization"

        mentions = [
          build_mention(name: entity_name, type: entity_type, role: "subject"),
          build_mention(name: location, type: "place", role: "location")
        ]

        build_fact(
          text: "#{entity_name} is located in #{location}",
          valid_at: extract_start_date(text) || default_date,
          invalid_at: nil,
          mentions: mentions,
          confidence: 0.7
        )
      end
    end
  end

  # First start date found in the text, trying DATE_PATTERNS in order.
  #
  # @return [Object, nil] result of parse_date, or nil when no pattern matches
  def extract_start_date(text)
    DATE_PATTERNS.each do |pattern|
      if (match = text.match(pattern))
        return parse_date(match[1])
      end
    end
    nil
  end

  # First end date found in the text, trying END_DATE_PATTERNS in order.
  #
  # @return [Object, nil] result of parse_date, or nil when no pattern matches
  def extract_end_date(text)
    END_DATE_PATTERNS.each do |pattern|
      if (match = text.match(pattern))
        return parse_date(match[1])
      end
    end
    nil
  end

  # Case-insensitive membership test against COMMON_WORDS.
  def common_word?(word)
    COMMON_WORDS.any? { |w| w.casecmp?(word) }
  end

  # Like String#scan, but yields full MatchData objects so callers can see the
  # whole matched phrase and not only the capture groups.
  def scan_matches(text, pattern)
    text.to_enum(:scan, pattern).map { Regexp.last_match }
  end

  # The complete matched phrase with runs of whitespace collapsed.
  def matched_phrase(match)
    match[0].gsub(/\s+/, " ")
  end

  # The part of the match that follows the first capture (the subject). Every
  # pattern in this class starts with its subject capture.
  def after_subject(match)
    match[0][match[1].length..-1]
  end
end
227
+ end
228
+ end
@@ -0,0 +1,109 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FactDb
4
+ module LLM
5
# Adapter for the ruby_llm gem.
# Gives the LLM extractor one small interface (#chat, aliased as #call and
# #complete) regardless of the provider behind it.
#
# @example Configure with OpenAI
#   FactDb.configure do |config|
#     config.llm_client = FactDb::LLM::Adapter.new(
#       provider: :openai,
#       api_key: ENV["OPENAI_API_KEY"],
#       model: "gpt-4o-mini"
#     )
#   end
#
# @example Configure with Anthropic
#   FactDb.configure do |config|
#     config.llm_client = FactDb::LLM::Adapter.new(
#       provider: :anthropic,
#       api_key: ENV["ANTHROPIC_API_KEY"],
#       model: "claude-sonnet-4-20250514"
#     )
#   end
#
# @example Configure via YAML (config/fact_db.yml)
#   # llm_provider: anthropic
#   # llm_model: claude-sonnet-4-20250514
#   # llm_api_key: <%= ENV["ANTHROPIC_API_KEY"] %>
#
# @example Configure via environment variables
#   # EVENT_CLOCK_LLM_PROVIDER=anthropic
#   # EVENT_CLOCK_LLM_MODEL=claude-sonnet-4-20250514
#   # EVENT_CLOCK_LLM_API_KEY=sk-...
#   # NOTE(review): the EVENT_CLOCK_ prefix does not match the gem name;
#   # confirm against the configuration loader.
#
class Adapter
  # Model used when the caller does not name one, keyed by provider.
  PROVIDER_DEFAULTS = {
    openai: "gpt-4o-mini",
    anthropic: "claude-sonnet-4-20250514",
    gemini: "gemini-2.0-flash",
    ollama: "llama3.2",
    bedrock: "claude-sonnet-4",
    openrouter: "anthropic/claude-sonnet-4"
  }.freeze

  attr_reader :model, :provider

  # Create an adapter for a specific LLM provider and configure RubyLLM for it.
  #
  # @param provider [Symbol] :openai, :anthropic, :gemini, :ollama, :bedrock, :openrouter
  # @param model [String] Model name (optional, uses provider default)
  # @param api_key [String] API key (optional if set via ENV)
  # @param options [Hash] Extra settings; :api_base (ollama), :region and
  #   :secret_key (bedrock) are read during configuration
  # @raise [ConfigurationError] for an unknown provider or a missing ruby_llm gem
  def initialize(provider:, model: nil, api_key: nil, **options)
    @provider = provider.to_sym
    @model = model || PROVIDER_DEFAULTS[@provider]
    @options = options

    configure_ruby_llm(api_key)
  end

  # Send a prompt to the LLM and return the response text.
  #
  # @param prompt [String] The prompt to send
  # @return [String] The response text
  def chat(prompt)
    RubyLLM.chat(model: model).ask(prompt).content
  end

  # Aliases for compatibility with different client interfaces
  alias_method :call, :chat
  alias_method :complete, :chat

  private

  # Loads ruby_llm on demand and applies the provider-specific settings.
  # A LoadError raised anywhere in here is reported as a ConfigurationError.
  def configure_ruby_llm(api_key)
    require "ruby_llm"

    RubyLLM.configure { |config| apply_provider_settings(config, api_key) }
  rescue LoadError
    raise ConfigurationError, "LLM adapter requires the 'ruby_llm' gem. Add it to your Gemfile:\n" \
                              " gem 'ruby_llm'"
  end

  # Writes credentials/endpoints for the selected provider onto the RubyLLM
  # config object. An explicit api_key wins over the environment variable.
  def apply_provider_settings(config, api_key)
    case provider
    when :openai
      config.openai_api_key = api_key || ENV.fetch("OPENAI_API_KEY", nil)
    when :anthropic
      config.anthropic_api_key = api_key || ENV.fetch("ANTHROPIC_API_KEY", nil)
    when :gemini
      config.gemini_api_key = api_key || ENV.fetch("GEMINI_API_KEY", nil)
    when :ollama
      config.ollama_api_base = @options[:api_base] || "http://localhost:11434"
    when :bedrock
      config.bedrock_region = @options[:region] || ENV.fetch("AWS_REGION", "us-east-1")
      config.bedrock_api_key = api_key || ENV.fetch("AWS_ACCESS_KEY_ID", nil)
      config.bedrock_secret_key = @options[:secret_key] || ENV.fetch("AWS_SECRET_ACCESS_KEY", nil)
    when :openrouter
      config.openrouter_api_key = api_key || ENV.fetch("OPENROUTER_API_KEY", nil)
    else
      raise ConfigurationError, "Unknown LLM provider: #{provider}. " \
                                "Supported: openai, anthropic, gemini, ollama, bedrock, openrouter"
    end
  end
end
108
+ end
109
+ end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FactDb
4
+ module Models
5
# A captured piece of source text (table fact_db_contents) that facts are
# linked to through fact_sources.
class Content < ActiveRecord::Base
  self.table_name = "fact_db_contents"

  has_many :fact_sources, class_name: "FactDb::Models::FactSource",
                          foreign_key: :content_id, dependent: :destroy
  has_many :facts, through: :fact_sources

  validates :content_hash, presence: true, uniqueness: true
  validates :content_type, presence: true
  validates :raw_text, presence: true
  validates :captured_at, presence: true

  # The hash is derived from raw_text just before validation on create.
  before_validation :generate_content_hash, on: :create

  # Content types
  TYPES = %w[email transcript document slack meeting_notes contract report].freeze

  validates :content_type, inclusion: { in: TYPES }, allow_nil: false

  scope :by_type, ->(type) { where(content_type: type) }
  scope :captured_between, ->(from, to) { where(captured_at: from..to) }
  scope :captured_after, ->(date) { where("captured_at >= ?", date) }
  scope :captured_before, ->(date) { where("captured_at <= ?", date) }

  # Full-text search
  scope :search_text, lambda { |query|
    where("to_tsvector('english', raw_text) @@ plainto_tsquery('english', ?)", query)
  }

  # Vector similarity search (requires neighbor gem configured).
  # Orders rows by the `<=>` distance between the embedding column and the
  # given embedding.
  #
  # The embedding is bound as a quoted SQL literal instead of being
  # interpolated into the fragment, so a crafted value cannot inject SQL.
  #
  # @param embedding [Object, nil] query embedding; its to_s form is used as the literal
  # @param limit [Integer] maximum number of rows
  # @return [ActiveRecord::Relation] empty relation when embedding is nil/false
  def self.nearest_neighbors(embedding, limit: 10)
    return none unless embedding

    distance_sql = sanitize_sql_array(["embedding <=> ?", embedding.to_s])
    order(Arel.sql(distance_sql)).limit(limit)
  end

  # @return [true] always reports true
  def immutable?
    true
  end

  # Number of whitespace-separated tokens in raw_text (0 when raw_text is nil).
  def word_count
    raw_text.to_s.split.size
  end

  # raw_text cut to at most +length+ characters, with "..." appended when it
  # was shortened. Returns an empty string when raw_text is nil.
  def preview(length: 200)
    text = raw_text.to_s
    return text if text.length <= length

    "#{text[0, length]}..."
  end

  private

  # SHA-256 hex digest of raw_text; left untouched when raw_text is blank.
  def generate_content_hash
    self.content_hash = Digest::SHA256.hexdigest(raw_text) if raw_text.present?
  end
end
61
+ end
62
+ end
@@ -0,0 +1,84 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FactDb
4
+ module Models
5
# A named thing (table fact_db_entities) that facts can mention. Entities
# carry aliases and may be merged into another entity.
class Entity < ActiveRecord::Base
  self.table_name = "fact_db_entities"

  has_many :aliases, class_name: "FactDb::Models::EntityAlias",
                     foreign_key: :entity_id, dependent: :destroy
  has_many :entity_mentions, class_name: "FactDb::Models::EntityMention",
                             foreign_key: :entity_id, dependent: :destroy
  has_many :facts, through: :entity_mentions

  belongs_to :merged_into, class_name: "FactDb::Models::Entity",
                           foreign_key: :merged_into_id, optional: true
  has_many :merged_entities, class_name: "FactDb::Models::Entity",
                             foreign_key: :merged_into_id

  validates :canonical_name, presence: true
  validates :entity_type, presence: true
  validates :resolution_status, presence: true

  # Entity types
  TYPES = %w[person organization place product event concept].freeze
  STATUSES = %w[unresolved resolved merged split].freeze

  validates :entity_type, inclusion: { in: TYPES }
  validates :resolution_status, inclusion: { in: STATUSES }

  scope :by_type, ->(type) { where(entity_type: type) }
  scope :resolved, -> { where(resolution_status: "resolved") }
  scope :unresolved, -> { where(resolution_status: "unresolved") }
  scope :not_merged, -> { where.not(resolution_status: "merged") }
  scope :people, -> { by_type("person") }
  scope :organizations, -> { by_type("organization") }
  scope :places, -> { by_type("place") }

  def resolved?
    resolution_status == "resolved"
  end

  def merged?
    resolution_status == "merged"
  end

  # Follows the merged_into chain to the entity this one ultimately stands for.
  #
  # @return [Entity, nil] self when not merged; otherwise the end of the
  #   merge chain. nil when this entity is marked merged but has no target.
  def canonical_entity
    return self unless merged?

    # Fall back to the direct target when the recursive lookup yields nil.
    merged_into&.canonical_entity || merged_into
  end

  # @return [Array<String>] alias_text of every alias row
  def all_aliases
    aliases.pluck(:alias_text)
  end

  # Finds the alias with this exact text or creates it; type and confidence
  # are only applied when a new row is created.
  def add_alias(text, type: nil, confidence: 1.0)
    aliases.find_or_create_by!(alias_text: text) do |a|
      a.alias_type = type
      a.confidence = confidence
    end
  end

  # Case-insensitive match against the canonical name, then the aliases.
  def matches_name?(name)
    return true if canonical_name.downcase == name.downcase

    aliases.exists?(["LOWER(alias_text) = ?", name.downcase])
  end

  # Facts mentioning this entity, narrowed by the currently_valid and
  # canonical scopes of the fact model (defined outside this class).
  def current_facts
    facts.currently_valid.canonical
  end

  # Facts mentioning this entity, narrowed by the fact model's valid_at(date)
  # and canonical scopes.
  def facts_at(date)
    facts.valid_at(date).canonical
  end

  # Vector similarity search for entity matching.
  # Orders rows by the `<=>` distance between the embedding column and the
  # given embedding.
  #
  # The embedding is bound as a quoted SQL literal instead of being
  # interpolated into the fragment, so a crafted value cannot inject SQL.
  #
  # @param embedding [Object, nil] query embedding; its to_s form is used as the literal
  # @param limit [Integer] maximum number of rows
  # @return [ActiveRecord::Relation] empty relation when embedding is nil/false
  def self.nearest_neighbors(embedding, limit: 10)
    return none unless embedding

    distance_sql = sanitize_sql_array(["embedding <=> ?", embedding.to_s])
    order(Arel.sql(distance_sql)).limit(limit)
  end
end
83
+ end
84
+ end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FactDb
4
+ module Models
5
# An alternative name for an entity (table fact_db_entity_aliases).
class EntityAlias < ActiveRecord::Base
  self.table_name = "fact_db_entity_aliases"

  # Alias types
  TYPES = %w[name nickname email handle abbreviation title].freeze

  belongs_to :entity, class_name: "FactDb::Models::Entity"

  # Alias text is required and may appear only once per entity.
  validates :alias_text, presence: true, uniqueness: { scope: :entity_id }
  validates :alias_type, inclusion: { in: TYPES }, allow_nil: true

  scope :by_type, lambda { |type| where(alias_type: type) }
  scope :high_confidence, lambda { where("confidence >= ?", 0.9) }

  # Case-insensitive lookup of the entity that owns the given alias text.
  #
  # @param text [String] alias text to look for
  # @return [Entity, nil] owning entity, or nil when no alias matches
  def self.find_entity_by_alias(text)
    matching_alias = where("LOWER(alias_text) = ?", text.downcase).take
    matching_alias&.entity
  end
end
25
+ end
26
+ end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FactDb
4
+ module Models
5
# Join record linking a fact to an entity it mentions
# (table fact_db_entity_mentions).
class EntityMention < ActiveRecord::Base
  self.table_name = "fact_db_entity_mentions"

  # Mention roles
  ROLES = %w[subject object location temporal instrument beneficiary].freeze

  belongs_to :fact, class_name: "FactDb::Models::Fact"
  belongs_to :entity, class_name: "FactDb::Models::Entity"

  validates :mention_text, presence: true
  # The same entity/text pair may be recorded only once per fact.
  validates :fact_id, uniqueness: { scope: %i[entity_id mention_text] }
  validates :mention_role, inclusion: { in: ROLES }, allow_nil: true

  scope :by_role, lambda { |role| where(mention_role: role) }
  scope :subjects, lambda { by_role("subject") }
  scope :objects, lambda { by_role("object") }
  scope :high_confidence, lambda { where("confidence >= ?", 0.9) }

  # @return [Boolean] true when this mention has the "subject" role
  def subject?
    role_is?("subject")
  end

  # @return [Boolean] true when this mention has the "object" role
  def object?
    role_is?("object")
  end

  private

  # Exact comparison of mention_role with the given role name.
  def role_is?(expected)
    mention_role == expected
  end
end
32
+ end
33
+ end