fact_db 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. checksums.yaml +4 -4
  2. data/.envrc +2 -0
  3. data/.yardopts +5 -0
  4. data/CHANGELOG.md +64 -0
  5. data/README.md +107 -6
  6. data/Rakefile +243 -10
  7. data/db/migrate/001_enable_extensions.rb +1 -0
  8. data/db/migrate/002_create_sources.rb +49 -0
  9. data/db/migrate/003_create_entities.rb +27 -15
  10. data/db/migrate/004_create_entity_aliases.rb +20 -7
  11. data/db/migrate/005_create_facts.rb +37 -21
  12. data/db/migrate/006_create_entity_mentions.rb +14 -6
  13. data/db/migrate/007_create_fact_sources.rb +16 -8
  14. data/docs/api/extractors/index.md +5 -5
  15. data/docs/api/extractors/llm.md +17 -17
  16. data/docs/api/extractors/rule-based.md +14 -14
  17. data/docs/api/facts.md +20 -20
  18. data/docs/api/index.md +4 -4
  19. data/docs/api/models/entity.md +21 -21
  20. data/docs/api/models/fact.md +15 -15
  21. data/docs/api/models/index.md +7 -7
  22. data/docs/api/models/{content.md → source.md} +29 -29
  23. data/docs/api/pipeline/extraction.md +25 -25
  24. data/docs/api/pipeline/index.md +1 -1
  25. data/docs/api/pipeline/resolution.md +4 -4
  26. data/docs/api/services/entity-service.md +20 -20
  27. data/docs/api/services/fact-service.md +12 -12
  28. data/docs/api/services/index.md +5 -5
  29. data/docs/api/services/{content-service.md → source-service.md} +27 -27
  30. data/docs/architecture/database-schema.md +46 -46
  31. data/docs/architecture/entity-resolution.md +6 -6
  32. data/docs/architecture/index.md +10 -10
  33. data/docs/architecture/temporal-facts.md +5 -5
  34. data/docs/architecture/three-layer-model.md +17 -17
  35. data/docs/concepts.md +6 -6
  36. data/docs/examples/basic-usage.md +20 -20
  37. data/docs/examples/hr-onboarding.md +17 -17
  38. data/docs/examples/index.md +4 -4
  39. data/docs/examples/news-analysis.md +23 -23
  40. data/docs/getting-started/database-setup.md +28 -20
  41. data/docs/getting-started/index.md +3 -3
  42. data/docs/getting-started/quick-start.md +33 -30
  43. data/docs/guides/batch-processing.md +26 -26
  44. data/docs/guides/configuration.md +158 -77
  45. data/docs/guides/entity-management.md +14 -14
  46. data/docs/guides/extracting-facts.md +28 -28
  47. data/docs/guides/ingesting-content.md +14 -14
  48. data/docs/guides/llm-integration.md +40 -32
  49. data/docs/guides/temporal-queries.md +11 -11
  50. data/docs/index.md +6 -2
  51. data/examples/.envrc +4 -0
  52. data/examples/.gitignore +1 -0
  53. data/examples/001_configuration.rb +312 -0
  54. data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
  55. data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
  56. data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
  57. data/examples/040_output_formats.rb +177 -0
  58. data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
  59. data/examples/060_fluent_temporal_api.rb +217 -0
  60. data/examples/070_introspection.rb +252 -0
  61. data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
  62. data/examples/090_ingest_demo.rb +515 -0
  63. data/examples/100_query_context.rb +668 -0
  64. data/examples/110_prove_it.rb +204 -0
  65. data/examples/120_dump_database.rb +358 -0
  66. data/examples/130_rag_feedback_loop.rb +858 -0
  67. data/examples/README.md +229 -15
  68. data/examples/data/lincoln_associates.md +201 -0
  69. data/examples/data/lincoln_biography.md +66 -0
  70. data/examples/data/lincoln_cabinet.md +243 -0
  71. data/examples/data/lincoln_family.md +163 -0
  72. data/examples/data/lincoln_military.md +241 -0
  73. data/examples/data/lincoln_todd_family.md +136 -0
  74. data/examples/ingest_reporter.rb +335 -0
  75. data/examples/utilities.rb +182 -0
  76. data/lib/fact_db/config/defaults.yml +254 -0
  77. data/lib/fact_db/config.rb +94 -35
  78. data/lib/fact_db/database.rb +98 -8
  79. data/lib/fact_db/extractors/base.rb +106 -21
  80. data/lib/fact_db/extractors/llm_extractor.rb +35 -63
  81. data/lib/fact_db/extractors/manual_extractor.rb +46 -6
  82. data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
  83. data/lib/fact_db/llm/adapter.rb +3 -3
  84. data/lib/fact_db/models/entity.rb +94 -22
  85. data/lib/fact_db/models/entity_alias.rb +41 -7
  86. data/lib/fact_db/models/entity_mention.rb +34 -1
  87. data/lib/fact_db/models/fact.rb +259 -28
  88. data/lib/fact_db/models/fact_source.rb +43 -9
  89. data/lib/fact_db/models/source.rb +113 -0
  90. data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
  91. data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
  92. data/lib/fact_db/query_result.rb +202 -0
  93. data/lib/fact_db/resolution/entity_resolver.rb +139 -39
  94. data/lib/fact_db/resolution/fact_resolver.rb +86 -14
  95. data/lib/fact_db/services/entity_service.rb +246 -37
  96. data/lib/fact_db/services/fact_service.rb +254 -17
  97. data/lib/fact_db/services/source_service.rb +164 -0
  98. data/lib/fact_db/temporal/query.rb +71 -7
  99. data/lib/fact_db/temporal/query_builder.rb +69 -0
  100. data/lib/fact_db/temporal/timeline.rb +102 -11
  101. data/lib/fact_db/transformers/base.rb +77 -0
  102. data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
  103. data/lib/fact_db/transformers/json_transformer.rb +17 -0
  104. data/lib/fact_db/transformers/raw_transformer.rb +35 -0
  105. data/lib/fact_db/transformers/text_transformer.rb +114 -0
  106. data/lib/fact_db/transformers/triple_transformer.rb +138 -0
  107. data/lib/fact_db/validation/alias_filter.rb +185 -0
  108. data/lib/fact_db/version.rb +1 -1
  109. data/lib/fact_db.rb +281 -30
  110. data/mkdocs.yml +2 -2
  111. metadata +60 -16
  112. data/db/migrate/002_create_contents.rb +0 -44
  113. data/lib/fact_db/models/content.rb +0 -62
  114. data/lib/fact_db/services/content_service.rb +0 -93
@@ -2,20 +2,44 @@
2
2
 
3
3
  module FactDb
4
4
  module Services
5
+ # Service class for managing entities in the database
6
+ #
7
+ # Provides methods for creating, searching, and managing entities including
8
+ # name resolution, alias management, and duplicate detection.
9
+ #
10
+ # @example Basic usage
11
+ # service = EntityService.new
12
+ # entity = service.create("John Smith", kind: :person)
13
+ #
5
14
  class EntityService
6
- attr_reader :config, :resolver
15
+ # @return [FactDb::Config] the configuration object
16
+ attr_reader :config
7
17
 
18
+ # @return [FactDb::Resolution::EntityResolver] the entity resolver instance
19
+ attr_reader :resolver
20
+
21
+ # Initializes a new EntityService instance
22
+ #
23
+ # @param config [FactDb::Config] configuration object (defaults to FactDb.config)
8
24
  def initialize(config = FactDb.config)
9
25
  @config = config
10
26
  @resolver = Resolution::EntityResolver.new(config)
11
27
  end
12
28
 
13
- def create(name, type:, aliases: [], attributes: {}, description: nil)
29
+ # Creates a new entity in the database
30
+ #
31
+ # @param name [String] the canonical name
32
+ # @param kind [Symbol, String] entity kind (:person, :organization, etc.)
33
+ # @param aliases [Array<String>] alternative names
34
+ # @param attributes [Hash] additional metadata attributes
35
+ # @param description [String, nil] entity description
36
+ # @return [FactDb::Models::Entity] the created entity
37
+ def create(name, kind:, aliases: [], attributes: {}, description: nil)
14
38
  embedding = generate_embedding(name)
15
39
 
16
40
  entity = Models::Entity.create!(
17
- canonical_name: name,
18
- entity_type: type.to_s,
41
+ name: name,
42
+ kind: kind.to_s,
19
43
  description: description,
20
44
  metadata: attributes,
21
45
  resolution_status: "resolved",
@@ -29,81 +53,202 @@ module FactDb
29
53
  entity
30
54
  end
31
55
 
56
+ # Finds an entity by ID
57
+ #
58
+ # @param id [Integer] the entity ID
59
+ # @return [FactDb::Models::Entity] the found entity
60
+ # @raise [ActiveRecord::RecordNotFound] if entity not found
32
61
  def find(id)
33
62
  Models::Entity.find(id)
34
63
  end
35
64
 
36
- def find_by_name(name, type: nil)
37
- scope = Models::Entity.where(["LOWER(canonical_name) = ?", name.downcase])
38
- scope = scope.where(entity_type: type) if type
65
+ # Finds an entity by exact name match
66
+ #
67
+ # @param name [String] the entity name (case-insensitive)
68
+ # @param kind [Symbol, String, nil] optional kind filter
69
+ # @return [FactDb::Models::Entity, nil] the found entity or nil
70
+ def find_by_name(name, kind: nil)
71
+ scope = Models::Entity.where(["LOWER(name) = ?", name.downcase])
72
+ scope = scope.where(kind: kind) if kind
39
73
  scope.not_merged.first
40
74
  end
41
75
 
42
- def resolve(name, type: nil)
43
- @resolver.resolve(name, type: type)
76
+ # Resolves a name to an existing entity
77
+ #
78
+ # Uses exact alias matching, canonical name matching, and fuzzy matching.
79
+ #
80
+ # @param name [String] the name to resolve
81
+ # @param kind [Symbol, nil] optional kind filter
82
+ # @return [FactDb::Resolution::ResolvedEntity, nil] resolved entity or nil
83
+ def resolve(name, kind: nil)
84
+ @resolver.resolve(name, kind: kind)
44
85
  end
45
86
 
46
- def resolve_or_create(name, type:, aliases: [], attributes: {}, description: nil)
47
- resolved = @resolver.resolve(name, type: type)
48
- return resolved.entity if resolved
87
+ # Resolves a name to an entity, creating one if not found
88
+ #
89
+ # Also checks if any provided aliases match existing entities.
90
+ #
91
+ # @param name [String] the name to resolve or create
92
+ # @param kind [Symbol, String] entity kind (required for creation)
93
+ # @param aliases [Array<String>] additional aliases
94
+ # @param attributes [Hash] additional attributes for new entity
95
+ # @param description [String, nil] entity description
96
+ # @return [FactDb::Models::Entity] the resolved or created entity
97
+ def resolve_or_create(name, kind:, aliases: [], attributes: {}, description: nil)
98
+ # First, try to resolve the canonical name
99
+ resolved = @resolver.resolve(name, kind: kind)
100
+ if resolved
101
+ # Add any new aliases to the resolved entity
102
+ add_new_aliases(resolved.entity, aliases)
103
+ return resolved.entity
104
+ end
49
105
 
50
- create(name, type: type, aliases: aliases, attributes: attributes, description: description)
106
+ # Check if any of the provided aliases match an existing entity
107
+ # This handles cases like: name="Lord", aliases=["Jesus"] where "Jesus" already exists
108
+ aliases.each do |alias_text|
109
+ next if alias_text.to_s.strip.empty?
110
+
111
+ resolved_by_alias = @resolver.resolve(alias_text.to_s.strip, kind: kind)
112
+ if resolved_by_alias
113
+ entity = resolved_by_alias.entity
114
+ # Add the new canonical name as an alias to the existing entity
115
+ entity.add_alias(name) unless entity.name.downcase == name.downcase
116
+ # Add all the other aliases too
117
+ add_new_aliases(entity, aliases)
118
+ return entity
119
+ end
120
+ end
121
+
122
+ create(name, kind: kind, aliases: aliases, attributes: attributes, description: description)
51
123
  end
52
124
 
125
+ # Merges two entities, keeping one as canonical
126
+ #
127
+ # @param keep_id [Integer] ID of the entity to keep
128
+ # @param merge_id [Integer] ID of the entity to merge
129
+ # @return [FactDb::Models::Entity] the kept entity
53
130
  def merge(keep_id, merge_id)
54
131
  @resolver.merge(keep_id, merge_id)
55
132
  end
56
133
 
57
- def add_alias(entity_id, alias_text, alias_type: nil, confidence: 1.0)
134
+ # Adds an alias to an entity
135
+ #
136
+ # @param entity_id [Integer] the entity ID
137
+ # @param alias_name [String] the alias text
138
+ # @param kind [String, nil] alias kind
139
+ # @param confidence [Float] confidence score
140
+ # @return [FactDb::Models::EntityAlias] the created alias
141
+ def add_alias(entity_id, alias_name, kind: nil, confidence: 1.0)
58
142
  entity = Models::Entity.find(entity_id)
59
- entity.add_alias(alias_text, type: alias_type, confidence: confidence)
143
+ entity.add_alias(alias_name, kind: kind, confidence: confidence)
60
144
  end
61
145
 
62
- def search(query, type: nil, limit: 20)
146
+ # Searches entities by name or alias using LIKE pattern matching
147
+ #
148
+ # @param query [String] the search query
149
+ # @param kind [Symbol, String, nil] optional kind filter
150
+ # @param limit [Integer] maximum number of results
151
+ # @return [ActiveRecord::Relation] matching entities
152
+ def search(query, kind: nil, limit: 20)
63
153
  scope = Models::Entity.not_merged
64
154
 
65
155
  # Search canonical names and aliases
66
156
  scope = scope.left_joins(:aliases).where(
67
- "LOWER(fact_db_entities.canonical_name) LIKE ? OR LOWER(fact_db_entity_aliases.alias_text) LIKE ?",
157
+ "LOWER(fact_db_entities.name) LIKE ? OR LOWER(fact_db_entity_aliases.name) LIKE ?",
68
158
  "%#{query.downcase}%",
69
159
  "%#{query.downcase}%"
70
160
  ).distinct
71
161
 
72
- scope = scope.where(entity_type: type) if type
162
+ scope = scope.where(kind: kind) if kind
73
163
  scope.limit(limit)
74
164
  end
75
165
 
76
- def semantic_search(query, type: nil, limit: 20)
166
+ # Searches entities using semantic similarity (vector search)
167
+ #
168
+ # Requires an embedding generator to be configured.
169
+ #
170
+ # @param query [String] the search query
171
+ # @param kind [Symbol, String, nil] optional kind filter
172
+ # @param limit [Integer] maximum number of results
173
+ # @return [ActiveRecord::Relation] semantically similar entities
174
+ def semantic_search(query, kind: nil, limit: 20)
77
175
  embedding = generate_embedding(query)
78
176
  return Models::Entity.none unless embedding
79
177
 
80
178
  scope = Models::Entity.not_merged.nearest_neighbors(embedding, limit: limit)
81
- scope = scope.where(entity_type: type) if type
179
+ scope = scope.where(kind: kind) if kind
82
180
  scope
83
181
  end
84
182
 
85
- def by_type(type)
86
- Models::Entity.by_type(type).not_merged.order(:canonical_name)
87
- end
183
+ # Searches entities using PostgreSQL trigram similarity (handles typos)
184
+ #
185
+ # Requires pg_trgm extension. Falls back to LIKE search if unavailable.
186
+ #
187
+ # @param query [String] search term (minimum 3 characters)
188
+ # @param kind [Symbol, String, nil] optional kind filter
189
+ # @param threshold [Float] minimum similarity score (0.0-1.0)
190
+ # @param limit [Integer] maximum number of results
191
+ # @return [Array<FactDb::Models::Entity>] entities ordered by similarity
192
+ def fuzzy_search(query, kind: nil, threshold: 0.3, limit: 20)
193
+ return [] if query.to_s.strip.length < 3
88
194
 
89
- def people(limit: nil)
90
- scope = Models::Entity.people.not_merged.order(:canonical_name)
91
- scope = scope.limit(limit) if limit
92
- scope
93
- end
195
+ sql = <<~SQL
196
+ SELECT DISTINCT e.id,
197
+ GREATEST(
198
+ similarity(LOWER(e.name), LOWER(?)),
199
+ COALESCE(MAX(similarity(LOWER(a.name), LOWER(?))), 0)
200
+ ) as sim_score
201
+ FROM fact_db_entities e
202
+ LEFT JOIN fact_db_entity_aliases a ON a.entity_id = e.id
203
+ WHERE e.resolution_status != 'merged'
204
+ AND (
205
+ similarity(LOWER(e.name), LOWER(?)) > ?
206
+ OR similarity(LOWER(a.name), LOWER(?)) > ?
207
+ )
208
+ GROUP BY e.id
209
+ ORDER BY sim_score DESC
210
+ LIMIT ?
211
+ SQL
94
212
 
95
- def organizations(limit: nil)
96
- scope = Models::Entity.organizations.not_merged.order(:canonical_name)
97
- scope = scope.limit(limit) if limit
98
- scope
213
+ sanitized = ActiveRecord::Base.sanitize_sql(
214
+ [sql, query, query, query, threshold, query, threshold, limit]
215
+ )
216
+
217
+ results = ActiveRecord::Base.connection.execute(sanitized)
218
+ entity_ids = results.map { |r| r["id"] }
219
+
220
+ return [] if entity_ids.empty?
221
+
222
+ # Preserve ordering by fetching in order
223
+ entities_by_id = Models::Entity.where(id: entity_ids).index_by(&:id)
224
+ ordered_entities = entity_ids.map { |id| entities_by_id[id] }.compact
225
+
226
+ # Apply kind filter if specified
227
+ if kind
228
+ ordered_entities = ordered_entities.select { |e| e.kind == kind.to_s }
229
+ end
230
+
231
+ ordered_entities
232
+ rescue ActiveRecord::StatementInvalid => e
233
+ # pg_trgm extension not available, fall back to LIKE search
234
+ config.logger&.warn("Fuzzy search unavailable (pg_trgm not installed): #{e.message}")
235
+ search(query, kind: kind, limit: limit).to_a
99
236
  end
100
237
 
101
- def places(limit: nil)
102
- scope = Models::Entity.places.not_merged.order(:canonical_name)
103
- scope = scope.limit(limit) if limit
104
- scope
238
+ # Returns entities of a specific kind
239
+ #
240
+ # @param kind [Symbol, String] the entity kind
241
+ # @return [ActiveRecord::Relation] entities of that kind
242
+ def by_kind(kind)
243
+ Models::Entity.by_kind(kind).not_merged.order(:name)
105
244
  end
106
245
 
246
+ # Returns facts about an entity
247
+ #
248
+ # @param entity_id [Integer] the entity ID
249
+ # @param at [Date, Time, nil] optional point in time
250
+ # @param status [Symbol] fact status filter
251
+ # @return [ActiveRecord::Relation] facts mentioning the entity
107
252
  def facts_about(entity_id, at: nil, status: :canonical)
108
253
  Temporal::Query.new.execute(
109
254
  entity_id: entity_id,
@@ -112,31 +257,95 @@ module FactDb
112
257
  )
113
258
  end
114
259
 
260
+ # Builds a timeline of facts for an entity
261
+ #
262
+ # @param entity_id [Integer] the entity ID
263
+ # @param from [Date, Time, nil] start of timeline range
264
+ # @param to [Date, Time, nil] end of timeline range
265
+ # @return [FactDb::Temporal::Timeline] timeline of facts
115
266
  def timeline_for(entity_id, from: nil, to: nil)
116
267
  Temporal::Timeline.new.build(entity_id: entity_id, from: from, to: to)
117
268
  end
118
269
 
270
+ # Finds potential duplicate entities
271
+ #
272
+ # @param threshold [Float, nil] minimum similarity score
273
+ # @return [Array<Hash>] array of potential duplicates
119
274
  def find_duplicates(threshold: nil)
120
275
  @resolver.find_duplicates(threshold: threshold)
121
276
  end
122
277
 
278
+ # Automatically merges high-confidence duplicates
279
+ #
280
+ # @return [void]
123
281
  def auto_merge_duplicates!
124
282
  @resolver.auto_merge_duplicates!
125
283
  end
126
284
 
285
+ # Returns aggregate statistics about entities
286
+ #
287
+ # @return [Hash] statistics including counts by kind and status
127
288
  def stats
128
289
  {
129
290
  total: Models::Entity.not_merged.count,
130
291
  total_count: Models::Entity.not_merged.count,
131
- by_type: Models::Entity.not_merged.group(:entity_type).count,
292
+ by_kind: Models::Entity.not_merged.group(:kind).count,
132
293
  by_status: Models::Entity.group(:resolution_status).count,
133
294
  merged_count: Models::Entity.where(resolution_status: "merged").count,
134
295
  with_facts: Models::Entity.joins(:entity_mentions).distinct.count
135
296
  }
136
297
  end
137
298
 
299
+ # Returns all relationship types used in the database
300
+ #
301
+ # @return [Array<Symbol>] relationship types (mention roles)
302
+ def relationship_types
303
+ Models::EntityMention.distinct.pluck(:mention_role).compact.map(&:to_sym)
304
+ end
305
+
306
+ # Returns relationship types for a specific entity
307
+ #
308
+ # @param entity_id [Integer] Entity ID
309
+ # @return [Array<Symbol>] Relationship types for this entity
310
+ def relationship_types_for(entity_id)
311
+ Models::EntityMention
312
+ .where(entity_id: entity_id)
313
+ .distinct
314
+ .pluck(:mention_role)
315
+ .compact
316
+ .map(&:to_sym)
317
+ end
318
+
319
+ # Returns the timespan of facts for an entity
320
+ #
321
+ # @param entity_id [Integer] Entity ID
322
+ # @return [Hash] Hash with :from and :to dates
323
+ def timespan_for(entity_id)
324
+ facts = Models::Fact
325
+ .joins(:entity_mentions)
326
+ .where(entity_mentions: { entity_id: entity_id })
327
+
328
+ {
329
+ from: facts.minimum(:valid_at),
330
+ to: facts.maximum(:valid_at) || Date.today
331
+ }
332
+ end
333
+
138
334
  private
139
335
 
336
+ def add_new_aliases(entity, aliases)
337
+ return unless aliases&.any?
338
+
339
+ # Filter out pronouns and generic terms
340
+ valid_aliases = Validation::AliasFilter.filter(aliases, name: entity.name)
341
+
342
+ valid_aliases.each do |alias_text|
343
+ next if entity.all_aliases.map(&:downcase).include?(alias_text.downcase)
344
+
345
+ entity.add_alias(alias_text)
346
+ end
347
+ end
348
+
140
349
  def generate_embedding(text)
141
350
  return nil unless config.embedding_generator
142
351