fact_db 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.envrc +2 -0
- data/.yardopts +5 -0
- data/CHANGELOG.md +64 -0
- data/README.md +107 -6
- data/Rakefile +243 -10
- data/db/migrate/001_enable_extensions.rb +1 -0
- data/db/migrate/002_create_sources.rb +49 -0
- data/db/migrate/003_create_entities.rb +27 -15
- data/db/migrate/004_create_entity_aliases.rb +20 -7
- data/db/migrate/005_create_facts.rb +37 -21
- data/db/migrate/006_create_entity_mentions.rb +14 -6
- data/db/migrate/007_create_fact_sources.rb +16 -8
- data/docs/api/extractors/index.md +5 -5
- data/docs/api/extractors/llm.md +17 -17
- data/docs/api/extractors/rule-based.md +14 -14
- data/docs/api/facts.md +20 -20
- data/docs/api/index.md +4 -4
- data/docs/api/models/entity.md +21 -21
- data/docs/api/models/fact.md +15 -15
- data/docs/api/models/index.md +7 -7
- data/docs/api/models/{content.md → source.md} +29 -29
- data/docs/api/pipeline/extraction.md +25 -25
- data/docs/api/pipeline/index.md +1 -1
- data/docs/api/pipeline/resolution.md +4 -4
- data/docs/api/services/entity-service.md +20 -20
- data/docs/api/services/fact-service.md +12 -12
- data/docs/api/services/index.md +5 -5
- data/docs/api/services/{content-service.md → source-service.md} +27 -27
- data/docs/architecture/database-schema.md +46 -46
- data/docs/architecture/entity-resolution.md +6 -6
- data/docs/architecture/index.md +10 -10
- data/docs/architecture/temporal-facts.md +5 -5
- data/docs/architecture/three-layer-model.md +17 -17
- data/docs/concepts.md +6 -6
- data/docs/examples/basic-usage.md +20 -20
- data/docs/examples/hr-onboarding.md +17 -17
- data/docs/examples/index.md +4 -4
- data/docs/examples/news-analysis.md +23 -23
- data/docs/getting-started/database-setup.md +28 -20
- data/docs/getting-started/index.md +3 -3
- data/docs/getting-started/quick-start.md +33 -30
- data/docs/guides/batch-processing.md +26 -26
- data/docs/guides/configuration.md +158 -77
- data/docs/guides/entity-management.md +14 -14
- data/docs/guides/extracting-facts.md +28 -28
- data/docs/guides/ingesting-content.md +14 -14
- data/docs/guides/llm-integration.md +40 -32
- data/docs/guides/temporal-queries.md +11 -11
- data/docs/index.md +6 -2
- data/examples/.envrc +4 -0
- data/examples/.gitignore +1 -0
- data/examples/001_configuration.rb +312 -0
- data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
- data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
- data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
- data/examples/040_output_formats.rb +177 -0
- data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
- data/examples/060_fluent_temporal_api.rb +217 -0
- data/examples/070_introspection.rb +252 -0
- data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
- data/examples/090_ingest_demo.rb +515 -0
- data/examples/100_query_context.rb +668 -0
- data/examples/110_prove_it.rb +204 -0
- data/examples/120_dump_database.rb +358 -0
- data/examples/130_rag_feedback_loop.rb +858 -0
- data/examples/README.md +229 -15
- data/examples/data/lincoln_associates.md +201 -0
- data/examples/data/lincoln_biography.md +66 -0
- data/examples/data/lincoln_cabinet.md +243 -0
- data/examples/data/lincoln_family.md +163 -0
- data/examples/data/lincoln_military.md +241 -0
- data/examples/data/lincoln_todd_family.md +136 -0
- data/examples/ingest_reporter.rb +335 -0
- data/examples/utilities.rb +182 -0
- data/lib/fact_db/config/defaults.yml +254 -0
- data/lib/fact_db/config.rb +94 -35
- data/lib/fact_db/database.rb +98 -8
- data/lib/fact_db/extractors/base.rb +106 -21
- data/lib/fact_db/extractors/llm_extractor.rb +35 -63
- data/lib/fact_db/extractors/manual_extractor.rb +46 -6
- data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
- data/lib/fact_db/llm/adapter.rb +3 -3
- data/lib/fact_db/models/entity.rb +94 -22
- data/lib/fact_db/models/entity_alias.rb +41 -7
- data/lib/fact_db/models/entity_mention.rb +34 -1
- data/lib/fact_db/models/fact.rb +259 -28
- data/lib/fact_db/models/fact_source.rb +43 -9
- data/lib/fact_db/models/source.rb +113 -0
- data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
- data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
- data/lib/fact_db/query_result.rb +202 -0
- data/lib/fact_db/resolution/entity_resolver.rb +139 -39
- data/lib/fact_db/resolution/fact_resolver.rb +86 -14
- data/lib/fact_db/services/entity_service.rb +246 -37
- data/lib/fact_db/services/fact_service.rb +254 -17
- data/lib/fact_db/services/source_service.rb +164 -0
- data/lib/fact_db/temporal/query.rb +71 -7
- data/lib/fact_db/temporal/query_builder.rb +69 -0
- data/lib/fact_db/temporal/timeline.rb +102 -11
- data/lib/fact_db/transformers/base.rb +77 -0
- data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
- data/lib/fact_db/transformers/json_transformer.rb +17 -0
- data/lib/fact_db/transformers/raw_transformer.rb +35 -0
- data/lib/fact_db/transformers/text_transformer.rb +114 -0
- data/lib/fact_db/transformers/triple_transformer.rb +138 -0
- data/lib/fact_db/validation/alias_filter.rb +185 -0
- data/lib/fact_db/version.rb +1 -1
- data/lib/fact_db.rb +281 -30
- data/mkdocs.yml +2 -2
- metadata +60 -16
- data/db/migrate/002_create_contents.rb +0 -44
- data/lib/fact_db/models/content.rb +0 -62
- data/lib/fact_db/services/content_service.rb +0 -93
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FactDb
|
|
4
|
+
module Transformers
|
|
5
|
+
# Pass-through transformer: hands back the query's underlying fact records
# untouched instead of converting them to another representation.
#
# Choose this format when the caller wants to keep working with the
# records themselves, for example to:
# - walk associations (entity_mentions, fact_sources)
# - run further database queries against the results
# - call record methods such as update, destroy, or reload
# - chain additional scopes or queries
#
# @example Basic usage
#   results = facts.query_facts(topic: "Paula Chen", format: :raw)
#   results.each do |fact|
#     puts fact.text
#     fact.entity_mentions.each { |m| puts m.entity.name }
#   end
#
# @example Post-filtering the returned records
#   results = facts.query_facts(topic: "Microsoft", format: :raw)
#   recent = results.select { |f| f.valid_at > 1.month.ago }
#
class RawTransformer < Base
  # Returns whatever the result object exposes as its raw facts.
  #
  # @param results [QueryResult] The query results
  # @return [Array<FactDb::Models::Fact>] The value of +results.raw_facts+
  #   (documented upstream as the original ActiveRecord Fact objects)
  def transform(results)
    results.raw_facts
  end
end
|
|
34
|
+
end
|
|
35
|
+
end
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FactDb
|
|
4
|
+
module Transformers
|
|
5
|
+
# Transforms results into human-readable, Markdown-flavoured text.
# Useful for direct LLM consumption or debugging.
#
# NOTE(review): +get_value+ and +format_date+ are not defined in this class;
# they are presumably inherited from Base -- confirm against base.rb.
class TextTransformer < Base
  # Fact statuses that get their own sub-section, in display order,
  # paired with the heading printed for each.
  STATUS_SECTIONS = [
    ["canonical", "Current Facts"],
    ["corroborated", "Corroborated Facts"],
    ["superseded", "Historical Facts (Superseded)"],
    ["synthesized", "Synthesized Facts"]
  ].freeze

  # Transform results to text format.
  #
  # @param results [QueryResult] The query results
  # @return [String] Human-readable text, or a "No results found" message
  #   when there are neither entities nor facts
  def transform(results)
    sections = []
    sections << format_entities_section(results) unless results.entities.empty?
    sections << format_facts_section(results) unless results.facts.empty?

    return "No results found for query: #{results.query}" if sections.empty?

    sections.join("\n\n")
  end

  private

  # Builds the "## Entities" block: one bullet per entity showing its
  # name, optional kind, and optional list of aliases.
  def format_entities_section(results)
    lines = ["## Entities"]

    results.each_entity do |entity|
      entity_kind = get_value(entity, :kind)
      aliases = get_value(entity, :aliases)

      line = "- **#{get_value(entity, :name)}**"
      line += " (#{entity_kind})" if entity_kind

      if aliases && !aliases.empty?
        # Aliases may be hashes (with a :name key) or plain values.
        alias_texts = aliases.map { |a| a.is_a?(Hash) ? a[:name] : a.to_s }
        line += " - also known as: #{alias_texts.join(', ')}"
      end

      lines << line
    end

    lines.join("\n")
  end

  # Builds the "## Facts" block, grouping facts by status.
  #
  # Known statuses are emitted in STATUS_SECTIONS order. Facts whose status
  # is missing or not listed there are emitted last under "Other Facts" so
  # that no fact in the result set is silently dropped from the output.
  def format_facts_section(results)
    lines = ["## Facts"]

    # to_s so symbol and string statuses land in the same bucket.
    facts_by_status = results.facts.group_by { |fact| (get_value(fact, :status) || "unknown").to_s }

    STATUS_SECTIONS.each do |status, heading|
      append_fact_group(lines, heading, facts_by_status.delete(status), results.entities)
    end

    # Whatever is left had an unrecognised (or no) status.
    append_fact_group(lines, "Other Facts", facts_by_status.values.flatten(1), results.entities)

    lines.join("\n")
  end

  # Appends a "### heading" line and one bullet per fact to +lines+.
  # Does nothing when +facts+ is nil or empty.
  def append_fact_group(lines, heading, facts, entities)
    return if facts.nil? || facts.empty?

    lines << "\n### #{heading}"
    facts.each { |fact| lines << format_fact(fact, entities) }
  end

  # Renders a single fact bullet: text, optional validity window, and
  # optional confidence expressed as a whole percentage.
  def format_fact(fact, _entities)
    valid_at = get_value(fact, :valid_at)
    invalid_at = get_value(fact, :invalid_at)
    confidence = get_value(fact, :confidence)

    line = "- #{get_value(fact, :text)}"

    temporal = []
    temporal << "from #{format_date(valid_at)}" if valid_at
    temporal << "until #{format_date(invalid_at)}" if invalid_at
    line += " (#{temporal.join(' ')})" unless temporal.empty?

    line += " [confidence: #{(confidence * 100).round}%]" if confidence

    line
  end
end
|
|
113
|
+
end
|
|
114
|
+
end
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FactDb
|
|
4
|
+
module Transformers
|
|
5
|
+
# Transforms results into Subject-Predicate-Object triples.
# This format encodes semantic structure that LLMs can leverage.
#
# NOTE(review): +get_value+ and +format_date+ are not defined in this class;
# they are presumably inherited from Base -- confirm against base.rb.
#
# @example Output format
#   [
#     ["Paula Chen", "kind", "Person"],
#     ["Paula Chen", "works_at", "Microsoft"],
#     ["Paula Chen", "works_at.valid_from", "2024-01-10"]
#   ]
class TripleTransformer < Base
  # Transform results to triples format.
  #
  # @param results [QueryResult] The query results
  # @return [Array<Array>] Array of [subject, predicate, object] triples;
  #   entity triples come first, followed by fact triples
  def transform(results)
    triples = []

    results.each_entity do |entity|
      triples.concat(entity_to_triples(entity))
    end

    results.each_fact do |fact|
      triples.concat(fact_to_triples(fact, results.entities))
    end

    triples
  end

  private

  # Triples describing one entity: its kind, each alias, and its
  # resolution status. Returns [] when the entity has no name.
  def entity_to_triples(entity)
    name = get_value(entity, :name)
    return [] unless name

    triples = []

    entity_kind = get_value(entity, :kind)
    triples << [name, "kind", entity_kind.to_s.capitalize] if entity_kind

    (get_value(entity, :aliases) || []).each do |aka|
      # Aliases may be hashes (with a :name key) or plain values.
      alias_name = aka.is_a?(Hash) ? aka[:name] : aka.to_s
      triples << [name, "also_known_as", alias_name]
    end

    status = get_value(entity, :resolution_status)
    triples << [name, "resolution_status", status] if status

    triples
  end

  # Triples describing one fact: the main assertion, its temporal bounds,
  # status and confidence (as "<predicate>.<attr>" predicates), plus one
  # triple per non-subject entity mention. Returns [] when the fact has
  # no text.
  def fact_to_triples(fact, entities)
    text = get_value(fact, :text)
    return [] unless text

    mentions = get_value(fact, :entity_mentions) || []
    subject_mention = mentions.find { |m| get_value(m, :mention_role) == "subject" }

    # Prefer the explicitly tagged subject; otherwise guess from the text.
    subject = subject_mention ? mention_entity_name(subject_mention, entities) : extract_subject(text)

    predicate, object = extract_predicate_object(text, subject)
    triples = [[subject, predicate, object]]

    valid_at = get_value(fact, :valid_at)
    triples << [subject, "#{predicate}.valid_from", format_date(valid_at)] if valid_at

    invalid_at = get_value(fact, :invalid_at)
    triples << [subject, "#{predicate}.valid_until", format_date(invalid_at)] if invalid_at

    status = get_value(fact, :status)
    triples << [subject, "#{predicate}.status", status] if status

    confidence = get_value(fact, :confidence)
    triples << [subject, "#{predicate}.confidence", confidence.to_s] if confidence

    # Every non-subject mention becomes a relationship named after its role.
    mentions.each do |mention|
      role = get_value(mention, :mention_role)
      next if role == "subject"

      triples << [subject, role, mention_entity_name(mention, entities)]
    end

    triples
  end

  # Resolves a mention to a display name via +entities[entity_id]+.
  # Falls back to "Entity_<id>" when the entity is not in the lookup or
  # has no name, so callers always receive a String.
  def mention_entity_name(mention, entities)
    entity_id = get_value(mention, :entity_id)
    entity = entities[entity_id]

    (entity && get_value(entity, :name)) || "Entity_#{entity_id}"
  end

  # Heuristic subject: every word before the first recognised verb.
  # When no verb is found the whole text is returned.
  def extract_subject(text)
    words = text.split(/\s+/)
    words.take_while { |w| !w.match?(/^(is|are|was|were|has|have|works|worked)$/i) }.join(" ")
  end

  # Splits the text that follows +subject+ into [predicate, object].
  #
  # Recognised leading verbs are normalised ("is"/"are"/"was"/"were" => "is",
  # "has"/"have" => "has", "works"/"worked" => "works_at"). For "works_at" a
  # leading "at" is removed from the object so "works at Microsoft" yields
  # "Microsoft" rather than "at Microsoft". Text with no recognised verb is
  # returned as ["asserts", remainder].
  def extract_predicate_object(text, subject)
    remainder = text.sub(/^#{Regexp.escape(subject)}\s*/i, "")

    match = remainder.match(/^(is|are|was|were|has|have|works?|worked)\s+(.+)/i)
    return ["asserts", remainder] unless match

    verb = match[1].downcase
    object = match[2]

    case verb
    when "is", "are", "was", "were"
      ["is", object]
    when "has", "have"
      ["has", object]
    when "works", "worked"
      # The preposition is already encoded in the predicate name.
      ["works_at", object.sub(/\Aat\s+/i, "")]
    else
      [verb, object]
    end
  end
end
|
|
137
|
+
end
|
|
138
|
+
end
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FactDb
|
|
4
|
+
module Validation
|
|
5
|
+
# Filters out invalid aliases such as pronouns, common terms, and generic references.
# Used by extractors, services, and models to ensure alias quality.
#
# All comparisons are done on a stripped, lower-cased copy of the input, so
# the word lists below are lower-case.
class AliasFilter
  # English pronouns (subject, object, possessive, reflexive)
  PRONOUNS = %w[
    i me my mine myself
    you your yours yourself yourselves
    he him his himself
    she her hers herself
    it its itself
    we us our ours ourselves
    they them their theirs themselves
    who whom whose
    this that these those
    what which
    one ones
    all any both each either neither none some
    another other others
  ].freeze

  # Common generic terms that shouldn't be aliases
  GENERIC_TERMS = %w[
    a an the
    man woman person people men women
    boy girl child children
    husband wife brother sister father mother son daughter
    king queen prince princess lord lady
    sir madam mr mrs ms miss dr
    someone something somewhere anyone anything anywhere
    everyone everything everywhere nobody nothing nowhere
    here there
    today yesterday tomorrow
    now then
  ].freeze

  # Common role/title references that are too generic
  # (multi-word entries use an escaped space inside %w)
  GENERIC_ROLES = %w[
    the\ man the\ woman the\ person the\ people
    a\ man a\ woman a\ person
    this\ man this\ woman this\ person
    that\ man that\ woman that\ person
    the\ king the\ queen the\ lord the\ lady
    the\ brother the\ sister the\ father the\ mother
    the\ husband the\ wife
    the\ boy the\ girl the\ child
    believers disciples apostles
    men greek\ men
  ].freeze

  # Common first names that are too ambiguous to use as standalone aliases
  # These should only be valid when part of a fuller name
  AMBIGUOUS_FIRST_NAMES = %w[
    simon peter john james paul mark matthew luke andrew philip
    thomas james joseph mary martha elizabeth sarah anna david
    michael robert william richard henry george charles edward
    mary ann jane elizabeth margaret catherine alice
  ].freeze

  # Connective words ignored when deciding whether a phrase carries any
  # identifying content.
  FILLER_WORDS = %w[a an the this that these those of and or].freeze

  class << self
    # Check if a potential alias is valid.
    #
    # Defined in terms of {rejection_reason} so the two methods can never
    # disagree about which aliases are acceptable.
    #
    # @param text [String] The alias text to validate
    # @param name [String, nil] The entity's name (for comparison)
    # @return [Boolean] true if the alias is valid
    def valid?(text, name: nil)
      rejection_reason(text, name: name).nil?
    end

    # Filter an array of aliases, returning only valid ones.
    #
    # Entries are stripped, invalid ones are removed, and duplicates that
    # differ only by case are collapsed (the first occurrence wins).
    #
    # @param aliases [Array<String>] Array of potential aliases
    # @param name [String, nil] The entity's name
    # @return [Array<String>] Array of valid aliases; [] if +aliases+ is not an Array
    def filter(aliases, name: nil)
      return [] unless aliases.is_a?(Array)

      aliases
        .map { |candidate| candidate.to_s.strip }
        .select { |candidate| valid?(candidate, name: name) }
        .uniq(&:downcase)
    end

    # Get a human-readable reason why an alias was rejected.
    #
    # Checks run in a fixed order and the first failing check determines
    # the reason.
    #
    # @param text [String] The alias text
    # @param name [String, nil] The entity's name
    # @return [String, nil] Rejection reason or nil if valid
    def rejection_reason(text, name: nil)
      normalized = text.to_s.strip.downcase

      return "empty or nil" if normalized.empty?
      return "too short (less than 2 characters)" if too_short?(normalized)
      return "is a pronoun" if pronoun?(normalized)
      return "is a generic term" if generic_term?(normalized)
      return "is a generic role reference" if generic_role?(normalized)
      # An alias identical to the entity's own name adds no information.
      return "matches the canonical name" if matches_canonical?(normalized, name)
      return "contains only articles and generic words" if only_articles_and_generic?(normalized)
      return "is an ambiguous standalone first name" if ambiguous_standalone_name?(normalized, name)

      nil
    end

    private

    # Single characters are rejected. The text reaching this method is
    # already lower-cased, so single-letter initials are rejected too.
    def too_short?(text)
      text.length < 2
    end

    def pronoun?(text)
      PRONOUNS.include?(text)
    end

    def generic_term?(text)
      GENERIC_TERMS.include?(text)
    end

    def generic_role?(text)
      GENERIC_ROLES.include?(text)
    end

    # True when the alias equals the entity's name (ignoring case and
    # surrounding whitespace). Always false when no name is given.
    def matches_canonical?(text, canonical_name)
      return false if canonical_name.nil?

      text == canonical_name.to_s.strip.downcase
    end

    # True when every word of the phrase is a filler word, a generic term,
    # or a pronoun -- i.e. nothing in it identifies a specific entity.
    def only_articles_and_generic?(text)
      words = text.split(/\s+/)
      return false if words.empty?

      words.all? do |word|
        FILLER_WORDS.include?(word) || GENERIC_TERMS.include?(word) || PRONOUNS.include?(word)
      end
    end

    # Check if text is a standalone ambiguous first name.
    # Single common first names are too likely to cause entity confusion,
    # but "Simon Peter" or "John Mark" would be acceptable.
    #
    # The name is allowed when it is the first word of the canonical name
    # (e.g. "Simon" for "Simon Peter" is ok, but "Simon" for "Jesus" is not).
    def ambiguous_standalone_name?(text, canonical_name)
      return false if text.nil?

      # List entries are single words, so membership implies a one-word alias.
      return false unless AMBIGUOUS_FIRST_NAMES.include?(text)

      canonical_first = canonical_name.to_s.strip.downcase.split(/\s+/).first
      canonical_first != text
    end
  end
end
|
|
184
|
+
end
|
|
185
|
+
end
|
data/lib/fact_db/version.rb
CHANGED