fact_db 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. checksums.yaml +4 -4
  2. data/.envrc +2 -0
  3. data/.yardopts +5 -0
  4. data/CHANGELOG.md +64 -0
  5. data/README.md +107 -6
  6. data/Rakefile +243 -10
  7. data/db/migrate/001_enable_extensions.rb +1 -0
  8. data/db/migrate/002_create_sources.rb +49 -0
  9. data/db/migrate/003_create_entities.rb +27 -15
  10. data/db/migrate/004_create_entity_aliases.rb +20 -7
  11. data/db/migrate/005_create_facts.rb +37 -21
  12. data/db/migrate/006_create_entity_mentions.rb +14 -6
  13. data/db/migrate/007_create_fact_sources.rb +16 -8
  14. data/docs/api/extractors/index.md +5 -5
  15. data/docs/api/extractors/llm.md +17 -17
  16. data/docs/api/extractors/rule-based.md +14 -14
  17. data/docs/api/facts.md +20 -20
  18. data/docs/api/index.md +4 -4
  19. data/docs/api/models/entity.md +21 -21
  20. data/docs/api/models/fact.md +15 -15
  21. data/docs/api/models/index.md +7 -7
  22. data/docs/api/models/{content.md → source.md} +29 -29
  23. data/docs/api/pipeline/extraction.md +25 -25
  24. data/docs/api/pipeline/index.md +1 -1
  25. data/docs/api/pipeline/resolution.md +4 -4
  26. data/docs/api/services/entity-service.md +20 -20
  27. data/docs/api/services/fact-service.md +12 -12
  28. data/docs/api/services/index.md +5 -5
  29. data/docs/api/services/{content-service.md → source-service.md} +27 -27
  30. data/docs/architecture/database-schema.md +46 -46
  31. data/docs/architecture/entity-resolution.md +6 -6
  32. data/docs/architecture/index.md +10 -10
  33. data/docs/architecture/temporal-facts.md +5 -5
  34. data/docs/architecture/three-layer-model.md +17 -17
  35. data/docs/concepts.md +6 -6
  36. data/docs/examples/basic-usage.md +20 -20
  37. data/docs/examples/hr-onboarding.md +17 -17
  38. data/docs/examples/index.md +4 -4
  39. data/docs/examples/news-analysis.md +23 -23
  40. data/docs/getting-started/database-setup.md +28 -20
  41. data/docs/getting-started/index.md +3 -3
  42. data/docs/getting-started/quick-start.md +33 -30
  43. data/docs/guides/batch-processing.md +26 -26
  44. data/docs/guides/configuration.md +158 -77
  45. data/docs/guides/entity-management.md +14 -14
  46. data/docs/guides/extracting-facts.md +28 -28
  47. data/docs/guides/ingesting-content.md +14 -14
  48. data/docs/guides/llm-integration.md +40 -32
  49. data/docs/guides/temporal-queries.md +11 -11
  50. data/docs/index.md +6 -2
  51. data/examples/.envrc +4 -0
  52. data/examples/.gitignore +1 -0
  53. data/examples/001_configuration.rb +312 -0
  54. data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
  55. data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
  56. data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
  57. data/examples/040_output_formats.rb +177 -0
  58. data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
  59. data/examples/060_fluent_temporal_api.rb +217 -0
  60. data/examples/070_introspection.rb +252 -0
  61. data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
  62. data/examples/090_ingest_demo.rb +515 -0
  63. data/examples/100_query_context.rb +668 -0
  64. data/examples/110_prove_it.rb +204 -0
  65. data/examples/120_dump_database.rb +358 -0
  66. data/examples/130_rag_feedback_loop.rb +858 -0
  67. data/examples/README.md +229 -15
  68. data/examples/data/lincoln_associates.md +201 -0
  69. data/examples/data/lincoln_biography.md +66 -0
  70. data/examples/data/lincoln_cabinet.md +243 -0
  71. data/examples/data/lincoln_family.md +163 -0
  72. data/examples/data/lincoln_military.md +241 -0
  73. data/examples/data/lincoln_todd_family.md +136 -0
  74. data/examples/ingest_reporter.rb +335 -0
  75. data/examples/utilities.rb +182 -0
  76. data/lib/fact_db/config/defaults.yml +254 -0
  77. data/lib/fact_db/config.rb +94 -35
  78. data/lib/fact_db/database.rb +98 -8
  79. data/lib/fact_db/extractors/base.rb +106 -21
  80. data/lib/fact_db/extractors/llm_extractor.rb +35 -63
  81. data/lib/fact_db/extractors/manual_extractor.rb +46 -6
  82. data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
  83. data/lib/fact_db/llm/adapter.rb +3 -3
  84. data/lib/fact_db/models/entity.rb +94 -22
  85. data/lib/fact_db/models/entity_alias.rb +41 -7
  86. data/lib/fact_db/models/entity_mention.rb +34 -1
  87. data/lib/fact_db/models/fact.rb +259 -28
  88. data/lib/fact_db/models/fact_source.rb +43 -9
  89. data/lib/fact_db/models/source.rb +113 -0
  90. data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
  91. data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
  92. data/lib/fact_db/query_result.rb +202 -0
  93. data/lib/fact_db/resolution/entity_resolver.rb +139 -39
  94. data/lib/fact_db/resolution/fact_resolver.rb +86 -14
  95. data/lib/fact_db/services/entity_service.rb +246 -37
  96. data/lib/fact_db/services/fact_service.rb +254 -17
  97. data/lib/fact_db/services/source_service.rb +164 -0
  98. data/lib/fact_db/temporal/query.rb +71 -7
  99. data/lib/fact_db/temporal/query_builder.rb +69 -0
  100. data/lib/fact_db/temporal/timeline.rb +102 -11
  101. data/lib/fact_db/transformers/base.rb +77 -0
  102. data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
  103. data/lib/fact_db/transformers/json_transformer.rb +17 -0
  104. data/lib/fact_db/transformers/raw_transformer.rb +35 -0
  105. data/lib/fact_db/transformers/text_transformer.rb +114 -0
  106. data/lib/fact_db/transformers/triple_transformer.rb +138 -0
  107. data/lib/fact_db/validation/alias_filter.rb +185 -0
  108. data/lib/fact_db/version.rb +1 -1
  109. data/lib/fact_db.rb +281 -30
  110. data/mkdocs.yml +2 -2
  111. metadata +60 -16
  112. data/db/migrate/002_create_contents.rb +0 -44
  113. data/lib/fact_db/models/content.rb +0 -62
  114. data/lib/fact_db/services/content_service.rb +0 -93
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
module FactDb
  module Transformers
    # Pass-through transformer: hands the query's fact records back untouched.
    #
    # Choose this format when the caller wants the database objects
    # themselves, for example in order to:
    # - walk ActiveRecord associations (entity_mentions, fact_sources)
    # - run further database queries against the results
    # - call ActiveRecord methods such as update, destroy, or reload
    # - chain additional scopes or queries
    #
    # @example Basic usage
    #   results = facts.query_facts(topic: "Paula Chen", format: :raw)
    #   results.each do |fact|
    #     puts fact.text
    #     fact.entity_mentions.each { |m| puts m.entity.name }
    #   end
    #
    # @example Chaining queries
    #   results = facts.query_facts(topic: "Microsoft", format: :raw)
    #   recent = results.select { |f| f.valid_at > 1.month.ago }
    #
    class RawTransformer < Base
      # Hand back the untransformed facts carried by the query result.
      #
      # @param results [QueryResult] The query results
      # @return [Array<FactDb::Models::Fact>] Original ActiveRecord Fact objects
      def transform(results)
        untouched_facts = results.raw_facts
        untouched_facts
      end
    end
  end
end
@@ -0,0 +1,114 @@
1
+ # frozen_string_literal: true
2
+
3
module FactDb
  module Transformers
    # Transforms results into human-readable text format.
    # Useful for direct LLM consumption or debugging.
    #
    # The output is Markdown-flavoured: an "## Entities" section followed by a
    # "## Facts" section in which facts are grouped under one sub-heading per
    # status. Field access goes through +get_value+ and dates through
    # +format_date+; both are presumably inherited from Base (not defined in
    # this class).
    class TextTransformer < Base
      # Sub-headings for the well-known fact statuses, in display order.
      # Any other status is rendered after these under "<Status> Facts".
      STATUS_HEADINGS = {
        "canonical" => "Current Facts",
        "corroborated" => "Corroborated Facts",
        "superseded" => "Historical Facts (Superseded)",
        "synthesized" => "Synthesized Facts"
      }.freeze

      # Transform results to text format.
      #
      # @param results [QueryResult] The query results
      # @return [String] Human-readable text, or a "No results found" message
      #   when the result holds neither entities nor facts
      def transform(results)
        sections = []
        sections << format_entities_section(results) unless results.entities.empty?
        sections << format_facts_section(results) unless results.facts.empty?

        return "No results found for query: #{results.query}" if sections.empty?

        sections.join("\n\n")
      end

      private

      # Build the "## Entities" section, one bullet per entity.
      #
      # @param results [QueryResult] The query results
      # @return [String]
      def format_entities_section(results)
        lines = ["## Entities"]
        results.each_entity { |entity| lines << format_entity(entity) }
        lines.join("\n")
      end

      # Render one entity as "- **Name** (kind) - also known as: a, b".
      # The kind and alias parts are omitted when the entity has none.
      #
      # @param entity [Object] an entity readable through get_value
      # @return [String]
      def format_entity(entity)
        line = "- **#{get_value(entity, :name)}**"

        entity_kind = get_value(entity, :kind)
        line += " (#{entity_kind})" if entity_kind

        aliases = get_value(entity, :aliases)
        if aliases && !aliases.empty?
          # Aliases may arrive as hashes carrying a :name key or as plain values.
          alias_texts = aliases.map { |a| a.is_a?(Hash) ? a[:name] : a.to_s }
          line += " - also known as: #{alias_texts.join(', ')}"
        end

        line
      end

      # Build the "## Facts" section.
      #
      # Facts are grouped by status. Known statuses come first in the order of
      # STATUS_HEADINGS; every remaining status (including facts without one,
      # shown as "unknown") follows, so that no fact is silently omitted.
      #
      # @param results [QueryResult] The query results
      # @return [String]
      def format_facts_section(results)
        facts_by_status = results.facts.group_by { |fact| status_key(fact) }
        ordered_statuses = STATUS_HEADINGS.keys + (facts_by_status.keys - STATUS_HEADINGS.keys)

        lines = ["## Facts"]
        ordered_statuses.each do |status|
          facts = facts_by_status[status]
          next unless facts

          lines << "\n### #{heading_for(status)}"
          facts.each { |fact| lines << format_fact(fact, results.entities) }
        end

        lines.join("\n")
      end

      # Normalise a fact's status to a String so that symbol and string
      # statuses land in the same group; a missing/blank status is "unknown".
      #
      # @param fact [Object] a fact readable through get_value
      # @return [String]
      def status_key(fact)
        status = get_value(fact, :status).to_s
        status.empty? ? "unknown" : status
      end

      # Heading text for a status group.
      #
      # @param status [String] normalised status
      # @return [String]
      def heading_for(status)
        STATUS_HEADINGS.fetch(status) { "#{status.capitalize} Facts" }
      end

      # Render one fact as a bullet with optional validity range and confidence.
      #
      # @param fact [Object] a fact readable through get_value
      # @param _entities [Object] unused; kept so the signature stays stable
      # @return [String]
      def format_fact(fact, _entities)
        line = "- #{get_value(fact, :text)}"

        valid_at = get_value(fact, :valid_at)
        invalid_at = get_value(fact, :invalid_at)
        temporal = []
        temporal << "from #{format_date(valid_at)}" if valid_at
        temporal << "until #{format_date(invalid_at)}" if invalid_at
        line += " (#{temporal.join(' ')})" unless temporal.empty?

        # Confidence is scaled by 100 and rounded for display as a percentage.
        confidence = get_value(fact, :confidence)
        line += " [confidence: #{(confidence * 100).round}%]" if confidence

        line
      end
    end
  end
end
@@ -0,0 +1,138 @@
1
+ # frozen_string_literal: true
2
+
3
module FactDb
  module Transformers
    # Transforms results into Subject-Predicate-Object triples.
    # This format encodes semantic structure that LLMs can leverage.
    #
    # Field access goes through +get_value+ and dates through +format_date+;
    # both are presumably inherited from Base (not defined in this class).
    #
    # @example Output format
    #   [
    #     ["Paula Chen", "kind", "Person"],
    #     ["Paula Chen", "works_at", "Microsoft"],
    #     ["Paula Chen", "works_at.valid_from", "2024-01-10"]
    #   ]
    class TripleTransformer < Base
      # Verbs that mark the boundary between subject and predicate in a fact's
      # text, mapped to the predicate they produce. Subject extraction and
      # predicate extraction both use this table so they cannot disagree.
      VERB_PREDICATES = {
        "is" => "is", "are" => "is", "was" => "is", "were" => "is",
        "has" => "has", "have" => "has",
        "work" => "works_at", "works" => "works_at", "worked" => "works_at"
      }.freeze

      # Matches "<verb> <rest of line>" at the very start of a string.
      VERB_PHRASE = /\A(#{Regexp.union(VERB_PREDICATES.keys).source})\s+(.+)/i

      # Transform results to triples format.
      #
      # @param results [QueryResult] The query results
      # @return [Array<Array>] Array of [subject, predicate, object] triples;
      #   entity triples first, then fact triples
      def transform(results)
        triples = []
        results.each_entity { |entity| triples.concat(entity_to_triples(entity)) }
        results.each_fact { |fact| triples.concat(fact_to_triples(fact, results.entities)) }
        triples
      end

      private

      # Triples describing one entity: kind, aliases and resolution status.
      #
      # @param entity [Object] an entity readable through get_value
      # @return [Array<Array>] empty when the entity has no name
      def entity_to_triples(entity)
        name = get_value(entity, :name)
        return [] unless name

        triples = []

        entity_kind = get_value(entity, :kind)
        triples << [name, "kind", entity_kind.to_s.capitalize] if entity_kind

        (get_value(entity, :aliases) || []).each do |aka|
          # Aliases may arrive as hashes carrying a :name key or as plain values.
          alias_name = aka.is_a?(Hash) ? aka[:name] : aka.to_s
          triples << [name, "also_known_as", alias_name]
        end

        status = get_value(entity, :resolution_status)
        triples << [name, "resolution_status", status] if status

        triples
      end

      # Triples describing one fact: the main assertion, its temporal/status/
      # confidence metadata, and one triple per non-subject entity mention.
      #
      # @param fact [Object] a fact readable through get_value
      # @param entities [#[]] entity lookup keyed by entity id
      # @return [Array<Array>] empty when the fact has no text
      def fact_to_triples(fact, entities)
        text = get_value(fact, :text)
        return [] unless text

        mentions = get_value(fact, :entity_mentions) || []
        subject = subject_for(text, mentions, entities)
        predicate, object = extract_predicate_object(text, subject)

        triples = [[subject, predicate, object]]

        valid_at = get_value(fact, :valid_at)
        triples << [subject, "#{predicate}.valid_from", format_date(valid_at)] if valid_at

        invalid_at = get_value(fact, :invalid_at)
        triples << [subject, "#{predicate}.valid_until", format_date(invalid_at)] if invalid_at

        status = get_value(fact, :status)
        triples << [subject, "#{predicate}.status", status] if status

        confidence = get_value(fact, :confidence)
        triples << [subject, "#{predicate}.confidence", confidence.to_s] if confidence

        # Every mention other than the subject becomes [subject, role, entity].
        mentions.each do |mention|
          role = get_value(mention, :mention_role)
          next if role == "subject"

          triples << [subject, role, mention_entity_name(mention, entities)]
        end

        triples
      end

      # Subject of a fact: the entity mentioned with role "subject" when there
      # is one, otherwise the words of the text preceding the first known verb.
      #
      # @param text [String] fact text
      # @param mentions [Array] entity mentions of the fact
      # @param entities [#[]] entity lookup keyed by entity id
      # @return [String]
      def subject_for(text, mentions, entities)
        subject_mention = mentions.find { |m| get_value(m, :mention_role) == "subject" }
        return extract_subject(text) unless subject_mention

        mention_entity_name(subject_mention, entities)
      end

      # Name of the entity a mention points at, or an "Entity_<id>" placeholder
      # when that entity is not part of the result set.
      #
      # @param mention [Object] a mention readable through get_value
      # @param entities [#[]] entity lookup keyed by entity id
      # @return [String]
      def mention_entity_name(mention, entities)
        entity_id = get_value(mention, :entity_id)
        entity = entities[entity_id]
        entity ? get_value(entity, :name) : "Entity_#{entity_id}"
      end

      # Heuristic subject: all words before the first verb in VERB_PREDICATES,
      # joined by single spaces. Returns the whole text when no verb is found.
      #
      # @param text [String] fact text
      # @return [String]
      def extract_subject(text)
        text.split.take_while { |word| !VERB_PREDICATES.key?(word.downcase) }.join(" ")
      end

      # Split the text that follows the subject into a predicate and an object.
      #
      # The subject is stripped from the start of the text (case-insensitively
      # and tolerating any whitespace between its words). When the remainder
      # starts with a known verb the mapped predicate is returned together with
      # the rest of that line; for "works_at" a leading "at" is dropped so that
      # "works at Microsoft" yields the object "Microsoft". Otherwise the
      # predicate is "asserts" and the object is the whole remainder.
      #
      # @param text [String] fact text
      # @param subject [String] subject previously determined for the fact
      # @return [Array(String, String)] [predicate, object]
      def extract_predicate_object(text, subject)
        subject_pattern = subject.to_s.split.map { |word| Regexp.escape(word) }.join('\s+')
        # \A (not ^) so that only the very start of the text is stripped.
        remainder = text.sub(/\A\s*#{subject_pattern}\s*/i, "")

        match = remainder.match(VERB_PHRASE)
        return ["asserts", remainder] unless match

        predicate = VERB_PREDICATES.fetch(match[1].downcase)
        object = match[2]
        object = object.sub(/\Aat\s+/i, "") if predicate == "works_at"

        [predicate, object]
      end
    end
  end
end
@@ -0,0 +1,185 @@
1
+ # frozen_string_literal: true
2
+
3
module FactDb
  module Validation
    # Filters out invalid aliases such as pronouns, common terms, and generic references.
    # Used by extractors, services, and models to ensure alias quality.
    #
    # All checks run against the alias after it has been stripped and
    # downcased, so matching is case-insensitive.
    class AliasFilter
      # English pronouns (subject, object, possessive, reflexive)
      PRONOUNS = %w[
        i me my mine myself
        you your yours yourself yourselves
        he him his himself
        she her hers herself
        it its itself
        we us our ours ourselves
        they them their theirs themselves
        who whom whose
        this that these those
        what which
        one ones
        all any both each either neither none some
        another other others
      ].freeze

      # Common generic terms that shouldn't be aliases
      GENERIC_TERMS = %w[
        a an the
        man woman person people men women
        boy girl child children
        husband wife brother sister father mother son daughter
        king queen prince princess lord lady
        sir madam mr mrs ms miss dr
        someone something somewhere anyone anything anywhere
        everyone everything everywhere nobody nothing nowhere
        here there
        today yesterday tomorrow
        now then
      ].freeze

      # Common role/title references that are too generic
      GENERIC_ROLES = [
        "the man", "the woman", "the person", "the people",
        "a man", "a woman", "a person",
        "this man", "this woman", "this person",
        "that man", "that woman", "that person",
        "the king", "the queen", "the lord", "the lady",
        "the brother", "the sister", "the father", "the mother",
        "the husband", "the wife",
        "the boy", "the girl", "the child",
        "believers", "disciples", "apostles",
        "men", "greek men"
      ].freeze

      # Common first names that are too ambiguous to use as standalone aliases
      # These should only be valid when part of a fuller name
      AMBIGUOUS_FIRST_NAMES = %w[
        simon peter john james paul mark matthew luke andrew philip
        thomas joseph mary martha elizabeth sarah anna david
        michael robert william richard henry george charles edward
        ann jane margaret catherine alice
      ].freeze

      # Words that carry no identifying information on their own
      FILLER_WORDS = %w[a an the this that these those of and or].freeze

      class << self
        # Check if a potential alias is valid
        #
        # An alias is valid exactly when {rejection_reason} finds nothing
        # wrong with it, so the two methods can never disagree.
        #
        # @param text [String] The alias text to validate
        # @param name [String, nil] The entity's name (for comparison)
        # @return [Boolean] true if the alias is valid
        def valid?(text, name: nil)
          rejection_reason(text, name: name).nil?
        end

        # Filter an array of aliases, returning only valid ones
        #
        # Entries are stripped, blank entries dropped, and duplicates removed
        # case-insensitively (the first spelling wins).
        #
        # @param aliases [Array<String>] Array of potential aliases
        # @param name [String, nil] The entity's name
        # @return [Array<String>] Array of valid aliases; empty when +aliases+
        #   is not an Array
        def filter(aliases, name: nil)
          return [] unless aliases.is_a?(Array)

          aliases
            .map { |candidate| candidate.to_s.strip }
            .reject(&:empty?)
            .select { |candidate| valid?(candidate, name: name) }
            .uniq(&:downcase)
        end

        # Get a human-readable reason why an alias was rejected
        #
        # @param text [String] The alias text
        # @param name [String, nil] The entity's name
        # @return [String, nil] Rejection reason or nil if valid
        def rejection_reason(text, name: nil)
          normalized = text.to_s.strip.downcase

          return "empty or nil" if normalized.empty?
          return "too short (less than 2 characters)" if too_short?(normalized)
          return "is a pronoun" if pronoun?(normalized)
          return "is a generic term" if generic_term?(normalized)
          return "is a generic role reference" if generic_role?(normalized)
          return "contains only articles and generic words" if only_articles_and_generic?(normalized)
          return "is an ambiguous standalone first name" if ambiguous_standalone_name?(normalized, name)
          # Checked last so every reason reported before this check existed
          # here is still reported unchanged.
          return "matches the canonical name" if matches_canonical?(normalized, name)

          nil
        end

        private

        # Single characters are almost never valid aliases. No exception is
        # made for initials: the text has already been downcased by the caller.
        def too_short?(text)
          text.length < 2
        end

        def pronoun?(text)
          PRONOUNS.include?(text)
        end

        def generic_term?(text)
          GENERIC_TERMS.include?(text)
        end

        def generic_role?(text)
          GENERIC_ROLES.include?(text)
        end

        # True when the alias is just the entity's own name (ignoring case and
        # surrounding whitespace), which adds no information.
        def matches_canonical?(text, canonical_name)
          return false if canonical_name.nil?

          text == canonical_name.to_s.strip.downcase
        end

        # True when every word is a filler word, a generic term, or a pronoun
        # (e.g. "the husband and wife", "all of them").
        def only_articles_and_generic?(text)
          words = text.split(/\s+/)
          return false if words.empty?

          words.all? do |word|
            FILLER_WORDS.include?(word) || GENERIC_TERMS.include?(word) || PRONOUNS.include?(word)
          end
        end

        # Check if text is a standalone ambiguous first name
        # Single common first names are too likely to cause entity confusion
        # But "Simon Peter" or "John Mark" would be acceptable
        #
        # The name is allowed when it equals the first word of the canonical
        # name (e.g., "Simon" for "Simon Peter" is ok, but "Simon" for "Jesus"
        # is not). Entries of AMBIGUOUS_FIRST_NAMES are single words, so a
        # match implies the alias itself is a single word.
        def ambiguous_standalone_name?(text, canonical_name)
          return false unless AMBIGUOUS_FIRST_NAMES.include?(text)

          canonical_first = canonical_name.to_s.strip.downcase.split(/\s+/).first
          canonical_first != text
        end
      end
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module FactDb
4
- VERSION = "0.0.2"
4
+ VERSION = "0.0.3"
5
5
  end