fact_db 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.envrc +2 -0
- data/.yardopts +5 -0
- data/CHANGELOG.md +64 -0
- data/README.md +107 -6
- data/Rakefile +243 -10
- data/db/migrate/001_enable_extensions.rb +1 -0
- data/db/migrate/002_create_sources.rb +49 -0
- data/db/migrate/003_create_entities.rb +27 -15
- data/db/migrate/004_create_entity_aliases.rb +20 -7
- data/db/migrate/005_create_facts.rb +37 -21
- data/db/migrate/006_create_entity_mentions.rb +14 -6
- data/db/migrate/007_create_fact_sources.rb +16 -8
- data/docs/api/extractors/index.md +5 -5
- data/docs/api/extractors/llm.md +17 -17
- data/docs/api/extractors/rule-based.md +14 -14
- data/docs/api/facts.md +20 -20
- data/docs/api/index.md +4 -4
- data/docs/api/models/entity.md +21 -21
- data/docs/api/models/fact.md +15 -15
- data/docs/api/models/index.md +7 -7
- data/docs/api/models/{content.md → source.md} +29 -29
- data/docs/api/pipeline/extraction.md +25 -25
- data/docs/api/pipeline/index.md +1 -1
- data/docs/api/pipeline/resolution.md +4 -4
- data/docs/api/services/entity-service.md +20 -20
- data/docs/api/services/fact-service.md +12 -12
- data/docs/api/services/index.md +5 -5
- data/docs/api/services/{content-service.md → source-service.md} +27 -27
- data/docs/architecture/database-schema.md +46 -46
- data/docs/architecture/entity-resolution.md +6 -6
- data/docs/architecture/index.md +10 -10
- data/docs/architecture/temporal-facts.md +5 -5
- data/docs/architecture/three-layer-model.md +17 -17
- data/docs/concepts.md +6 -6
- data/docs/examples/basic-usage.md +20 -20
- data/docs/examples/hr-onboarding.md +17 -17
- data/docs/examples/index.md +4 -4
- data/docs/examples/news-analysis.md +23 -23
- data/docs/getting-started/database-setup.md +28 -20
- data/docs/getting-started/index.md +3 -3
- data/docs/getting-started/quick-start.md +33 -30
- data/docs/guides/batch-processing.md +26 -26
- data/docs/guides/configuration.md +158 -77
- data/docs/guides/entity-management.md +14 -14
- data/docs/guides/extracting-facts.md +28 -28
- data/docs/guides/ingesting-content.md +14 -14
- data/docs/guides/llm-integration.md +40 -32
- data/docs/guides/temporal-queries.md +11 -11
- data/docs/index.md +6 -2
- data/examples/.envrc +4 -0
- data/examples/.gitignore +1 -0
- data/examples/001_configuration.rb +312 -0
- data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
- data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
- data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
- data/examples/040_output_formats.rb +177 -0
- data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
- data/examples/060_fluent_temporal_api.rb +217 -0
- data/examples/070_introspection.rb +252 -0
- data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
- data/examples/090_ingest_demo.rb +515 -0
- data/examples/100_query_context.rb +668 -0
- data/examples/110_prove_it.rb +204 -0
- data/examples/120_dump_database.rb +358 -0
- data/examples/130_rag_feedback_loop.rb +858 -0
- data/examples/README.md +229 -15
- data/examples/data/lincoln_associates.md +201 -0
- data/examples/data/lincoln_biography.md +66 -0
- data/examples/data/lincoln_cabinet.md +243 -0
- data/examples/data/lincoln_family.md +163 -0
- data/examples/data/lincoln_military.md +241 -0
- data/examples/data/lincoln_todd_family.md +136 -0
- data/examples/ingest_reporter.rb +335 -0
- data/examples/utilities.rb +182 -0
- data/lib/fact_db/config/defaults.yml +254 -0
- data/lib/fact_db/config.rb +94 -35
- data/lib/fact_db/database.rb +98 -8
- data/lib/fact_db/extractors/base.rb +106 -21
- data/lib/fact_db/extractors/llm_extractor.rb +35 -63
- data/lib/fact_db/extractors/manual_extractor.rb +46 -6
- data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
- data/lib/fact_db/llm/adapter.rb +3 -3
- data/lib/fact_db/models/entity.rb +94 -22
- data/lib/fact_db/models/entity_alias.rb +41 -7
- data/lib/fact_db/models/entity_mention.rb +34 -1
- data/lib/fact_db/models/fact.rb +259 -28
- data/lib/fact_db/models/fact_source.rb +43 -9
- data/lib/fact_db/models/source.rb +113 -0
- data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
- data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
- data/lib/fact_db/query_result.rb +202 -0
- data/lib/fact_db/resolution/entity_resolver.rb +139 -39
- data/lib/fact_db/resolution/fact_resolver.rb +86 -14
- data/lib/fact_db/services/entity_service.rb +246 -37
- data/lib/fact_db/services/fact_service.rb +254 -17
- data/lib/fact_db/services/source_service.rb +164 -0
- data/lib/fact_db/temporal/query.rb +71 -7
- data/lib/fact_db/temporal/query_builder.rb +69 -0
- data/lib/fact_db/temporal/timeline.rb +102 -11
- data/lib/fact_db/transformers/base.rb +77 -0
- data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
- data/lib/fact_db/transformers/json_transformer.rb +17 -0
- data/lib/fact_db/transformers/raw_transformer.rb +35 -0
- data/lib/fact_db/transformers/text_transformer.rb +114 -0
- data/lib/fact_db/transformers/triple_transformer.rb +138 -0
- data/lib/fact_db/validation/alias_filter.rb +185 -0
- data/lib/fact_db/version.rb +1 -1
- data/lib/fact_db.rb +281 -30
- data/mkdocs.yml +2 -2
- metadata +60 -16
- data/db/migrate/002_create_contents.rb +0 -44
- data/lib/fact_db/models/content.rb +0 -62
- data/lib/fact_db/services/content_service.rb +0 -93
|
@@ -2,34 +2,73 @@
|
|
|
2
2
|
|
|
3
3
|
module FactDb
|
|
4
4
|
module Extractors
|
|
5
|
+
# Abstract base class for fact extractors
|
|
6
|
+
#
|
|
7
|
+
# Provides common interface and helper methods for extracting facts and entities
|
|
8
|
+
# from text. Subclasses must implement #extract and #extract_entities.
|
|
9
|
+
#
|
|
10
|
+
# @abstract Subclass and override {#extract} and {#extract_entities} to implement.
|
|
11
|
+
#
|
|
12
|
+
# @example Create a custom extractor
|
|
13
|
+
# class MyExtractor < FactDb::Extractors::Base
|
|
14
|
+
# def extract(text, context = {})
|
|
15
|
+
# # Implementation
|
|
16
|
+
# end
|
|
17
|
+
#
|
|
18
|
+
# def extract_entities(text)
|
|
19
|
+
# # Implementation
|
|
20
|
+
# end
|
|
21
|
+
# end
|
|
22
|
+
#
|
|
5
23
|
class Base
|
|
24
|
+
# @return [FactDb::Config] the configuration object
|
|
6
25
|
attr_reader :config
|
|
7
26
|
|
|
27
|
+
# Initializes a new extractor
|
|
28
|
+
#
|
|
29
|
+
# @param config [FactDb::Config] configuration object (defaults to FactDb.config)
|
|
8
30
|
def initialize(config = FactDb.config)
|
|
9
31
|
@config = config
|
|
10
32
|
end
|
|
11
33
|
|
|
12
|
-
#
|
|
13
|
-
#
|
|
14
|
-
# @
|
|
15
|
-
# @
|
|
34
|
+
# Extracts facts from text
|
|
35
|
+
#
|
|
36
|
+
# @abstract Subclass and override this method
|
|
37
|
+
# @param text [String] raw text to extract from
|
|
38
|
+
# @param context [Hash] additional context (captured_at, source_uri, etc.)
|
|
39
|
+
# @return [Array<Hash>] array of fact data hashes
|
|
40
|
+
# @raise [NotImplementedError] if not implemented by subclass
|
|
16
41
|
def extract(text, context = {})
|
|
17
42
|
raise NotImplementedError, "#{self.class} must implement #extract"
|
|
18
43
|
end
|
|
19
44
|
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
# @
|
|
45
|
+
# Extracts entities from text
|
|
46
|
+
#
|
|
47
|
+
# @abstract Subclass and override this method
|
|
48
|
+
# @param text [String] raw text to extract from
|
|
49
|
+
# @return [Array<Hash>] array of entity hashes with :name, :kind, :aliases
|
|
50
|
+
# @raise [NotImplementedError] if not implemented by subclass
|
|
23
51
|
def extract_entities(text)
|
|
24
52
|
raise NotImplementedError, "#{self.class} must implement #extract_entities"
|
|
25
53
|
end
|
|
26
54
|
|
|
27
|
-
#
|
|
55
|
+
# Returns the extraction method name derived from class name
|
|
56
|
+
#
|
|
57
|
+
# @return [String] method name (e.g., "manual", "llm", "rule_based")
|
|
28
58
|
def extraction_method
|
|
29
59
|
self.class.name.split("::").last.sub("Extractor", "").underscore
|
|
30
60
|
end
|
|
31
61
|
|
|
32
62
|
class << self
|
|
63
|
+
# Factory method to create an extractor by type
|
|
64
|
+
#
|
|
65
|
+
# @param type [Symbol, String] extractor type (:manual, :llm, :rule_based)
|
|
66
|
+
# @param config [FactDb::Config] configuration object
|
|
67
|
+
# @return [Base] an extractor instance
|
|
68
|
+
# @raise [ArgumentError] if type is unknown
|
|
69
|
+
#
|
|
70
|
+
# @example
|
|
71
|
+
# extractor = FactDb::Extractors::Base.for(:llm)
|
|
33
72
|
def for(type, config = FactDb.config)
|
|
34
73
|
case type.to_sym
|
|
35
74
|
when :manual
|
|
@@ -43,6 +82,9 @@ module FactDb
|
|
|
43
82
|
end
|
|
44
83
|
end
|
|
45
84
|
|
|
85
|
+
# Returns list of available extractor types
|
|
86
|
+
#
|
|
87
|
+
# @return [Array<Symbol>] available extractor type symbols
|
|
46
88
|
def available_types
|
|
47
89
|
%i[manual llm rule_based]
|
|
48
90
|
end
|
|
@@ -50,7 +92,12 @@ module FactDb
|
|
|
50
92
|
|
|
51
93
|
protected
|
|
52
94
|
|
|
53
|
-
#
|
|
95
|
+
# Parses a date string, returning nil if invalid
|
|
96
|
+
#
|
|
97
|
+
# Supports natural language parsing via Chronic if available.
|
|
98
|
+
#
|
|
99
|
+
# @param date_str [String, nil] date string to parse
|
|
100
|
+
# @return [Date, nil] parsed date or nil if invalid
|
|
54
101
|
def parse_date(date_str)
|
|
55
102
|
return nil if date_str.nil? || date_str.to_s.empty?
|
|
56
103
|
|
|
@@ -65,7 +112,12 @@ module FactDb
|
|
|
65
112
|
nil
|
|
66
113
|
end
|
|
67
114
|
|
|
68
|
-
#
|
|
115
|
+
# Parses a timestamp string, returning nil if invalid
|
|
116
|
+
#
|
|
117
|
+
# Supports natural language parsing via Chronic if available.
|
|
118
|
+
#
|
|
119
|
+
# @param timestamp_str [String, nil] timestamp string to parse
|
|
120
|
+
# @return [Time, nil] parsed time or nil if invalid
|
|
69
121
|
def parse_timestamp(timestamp_str)
|
|
70
122
|
return nil if timestamp_str.nil? || timestamp_str.to_s.empty?
|
|
71
123
|
|
|
@@ -80,7 +132,15 @@ module FactDb
|
|
|
80
132
|
nil
|
|
81
133
|
end
|
|
82
134
|
|
|
83
|
-
#
|
|
135
|
+
# Builds a standardized fact hash
|
|
136
|
+
#
|
|
137
|
+
# @param text [String] the fact text
|
|
138
|
+
# @param valid_at [Date, Time] when the fact became valid
|
|
139
|
+
# @param invalid_at [Date, Time, nil] when the fact became invalid
|
|
140
|
+
# @param mentions [Array<Hash>] entity mentions
|
|
141
|
+
# @param confidence [Float] confidence score (0.0 to 1.0)
|
|
142
|
+
# @param metadata [Hash] additional metadata
|
|
143
|
+
# @return [Hash] standardized fact hash for persistence
|
|
84
144
|
def build_fact(text:, valid_at:, invalid_at: nil, mentions: [], confidence: 1.0, metadata: {})
|
|
85
145
|
{
|
|
86
146
|
text: text.strip,
|
|
@@ -93,23 +153,48 @@ module FactDb
|
|
|
93
153
|
}
|
|
94
154
|
end
|
|
95
155
|
|
|
96
|
-
#
|
|
97
|
-
|
|
156
|
+
# Builds a standardized entity hash
|
|
157
|
+
#
|
|
158
|
+
# Automatically filters aliases through AliasFilter.
|
|
159
|
+
#
|
|
160
|
+
# @param name [String] the entity name
|
|
161
|
+
# @param kind [String, Symbol] entity kind (person, organization, etc.)
|
|
162
|
+
# @param aliases [Array<String>] alternative names
|
|
163
|
+
# @param attributes [Hash] additional attributes
|
|
164
|
+
# @return [Hash] standardized entity hash
|
|
165
|
+
def build_entity(name:, kind:, aliases: [], attributes: {})
|
|
166
|
+
canonical_name = name.strip
|
|
167
|
+
filtered_aliases = Validation::AliasFilter.filter(aliases, name: canonical_name)
|
|
168
|
+
|
|
98
169
|
{
|
|
99
|
-
name:
|
|
100
|
-
|
|
101
|
-
aliases:
|
|
170
|
+
name: canonical_name,
|
|
171
|
+
kind: kind.to_s,
|
|
172
|
+
aliases: filtered_aliases,
|
|
102
173
|
attributes: attributes
|
|
103
174
|
}
|
|
104
175
|
end
|
|
105
176
|
|
|
106
|
-
#
|
|
107
|
-
|
|
177
|
+
# Builds a standardized entity mention hash
|
|
178
|
+
#
|
|
179
|
+
# Automatically filters aliases through AliasFilter.
|
|
180
|
+
#
|
|
181
|
+
# @param name [String] the entity name
|
|
182
|
+
# @param kind [String, Symbol] entity kind
|
|
183
|
+
# @param role [String, Symbol, nil] mention role (subject, object, etc.)
|
|
184
|
+
# @param confidence [Float] confidence score (0.0 to 1.0)
|
|
185
|
+
# @param aliases [Array<String>] alternative names
|
|
186
|
+
# @return [Hash] standardized mention hash
|
|
187
|
+
def build_mention(name:, kind:, role: nil, confidence: 1.0, aliases: [])
|
|
188
|
+
canonical_name = name.strip
|
|
189
|
+
raw_aliases = Array(aliases).map { |a| a.to_s.strip }.reject(&:empty?)
|
|
190
|
+
filtered_aliases = Validation::AliasFilter.filter(raw_aliases, name: canonical_name)
|
|
191
|
+
|
|
108
192
|
{
|
|
109
|
-
name:
|
|
110
|
-
|
|
193
|
+
name: canonical_name,
|
|
194
|
+
kind: kind.to_s,
|
|
111
195
|
role: role&.to_s,
|
|
112
|
-
confidence: confidence
|
|
196
|
+
confidence: confidence,
|
|
197
|
+
aliases: filtered_aliases
|
|
113
198
|
}
|
|
114
199
|
end
|
|
115
200
|
end
|
|
@@ -4,84 +4,55 @@ require "json"
|
|
|
4
4
|
|
|
5
5
|
module FactDb
|
|
6
6
|
module Extractors
|
|
7
|
+
# LLM-based fact extractor using language models
|
|
8
|
+
#
|
|
9
|
+
# Uses a configured LLM client to extract atomic facts and entities from
|
|
10
|
+
# unstructured text. Parses JSON responses from the LLM and builds
|
|
11
|
+
# standardized fact/entity hashes.
|
|
12
|
+
#
|
|
13
|
+
# @example Extract facts using LLM
|
|
14
|
+
# FactDb.configure { |c| c.llm_client = MyLLMClient.new }
|
|
15
|
+
# extractor = LLMExtractor.new
|
|
16
|
+
# facts = extractor.extract("Paula joined Microsoft on January 10, 2024...")
|
|
17
|
+
#
|
|
7
18
|
class LLMExtractor < Base
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
Return as a JSON array with this structure:
|
|
20
|
-
[
|
|
21
|
-
{
|
|
22
|
-
"text": "Paula works at Microsoft as Principal Engineer",
|
|
23
|
-
"valid_at": "2024-01-10",
|
|
24
|
-
"invalid_at": null,
|
|
25
|
-
"confidence": 0.95,
|
|
26
|
-
"mentions": [
|
|
27
|
-
{"name": "Paula", "type": "person", "role": "subject"},
|
|
28
|
-
{"name": "Microsoft", "type": "organization", "role": "object"}
|
|
29
|
-
]
|
|
30
|
-
}
|
|
31
|
-
]
|
|
32
|
-
|
|
33
|
-
Rules:
|
|
34
|
-
- Extract only factual assertions, not opinions or speculation
|
|
35
|
-
- Use ISO 8601 date format (YYYY-MM-DD) when possible
|
|
36
|
-
- Set invalid_at to null if the fact is still true or unknown
|
|
37
|
-
- Set valid_at to null if the timing is not mentioned
|
|
38
|
-
- Entity types: person, organization, place, product, event, concept
|
|
39
|
-
- Roles: subject, object, location, temporal, instrument, beneficiary
|
|
40
|
-
|
|
41
|
-
Return only valid JSON, no additional text.
|
|
42
|
-
PROMPT
|
|
43
|
-
|
|
44
|
-
ENTITY_EXTRACTION_PROMPT = <<~PROMPT
|
|
45
|
-
Extract all named entities from the following text.
|
|
46
|
-
For each entity:
|
|
47
|
-
1. Identify the canonical name
|
|
48
|
-
2. Classify the type (person, organization, place, product, event, concept)
|
|
49
|
-
3. List any aliases or alternative names mentioned
|
|
50
|
-
|
|
51
|
-
Text:
|
|
52
|
-
%<text>s
|
|
53
|
-
|
|
54
|
-
Return as a JSON array:
|
|
55
|
-
[
|
|
56
|
-
{
|
|
57
|
-
"name": "Paula Chen",
|
|
58
|
-
"type": "person",
|
|
59
|
-
"aliases": ["Paula", "P. Chen"]
|
|
60
|
-
}
|
|
61
|
-
]
|
|
62
|
-
|
|
63
|
-
Return only valid JSON, no additional text.
|
|
64
|
-
PROMPT
|
|
65
|
-
|
|
19
|
+
# Extracts atomic facts from text using the configured LLM
|
|
20
|
+
#
|
|
21
|
+
# Prompts the LLM to identify factual assertions, temporal information,
|
|
22
|
+
# entity mentions with roles, and confidence scores.
|
|
23
|
+
#
|
|
24
|
+
# @param text [String] raw text to extract from
|
|
25
|
+
# @param context [Hash] additional context
|
|
26
|
+
# @option context [Date, Time] :captured_at default timestamp for facts
|
|
27
|
+
# @return [Array<Hash>] array of fact hashes
|
|
28
|
+
# @raise [ConfigurationError] if no LLM client is configured
|
|
66
29
|
def extract(text, context = {})
|
|
67
30
|
return [] if text.nil? || text.strip.empty?
|
|
68
31
|
|
|
69
32
|
client = config.llm_client
|
|
70
33
|
raise ConfigurationError, "LLM client not configured" unless client
|
|
71
34
|
|
|
72
|
-
prompt = format(
|
|
35
|
+
prompt = format(config.prompts.fact_extraction, text: text)
|
|
73
36
|
response = call_llm(client, prompt)
|
|
74
37
|
|
|
75
38
|
parse_fact_response(response, context)
|
|
76
39
|
end
|
|
77
40
|
|
|
41
|
+
# Extracts entities from text using the configured LLM
|
|
42
|
+
#
|
|
43
|
+
# Prompts the LLM to identify named entities, classify their types,
|
|
44
|
+
# and list any aliases or alternative names.
|
|
45
|
+
#
|
|
46
|
+
# @param text [String] raw text to extract from
|
|
47
|
+
# @return [Array<Hash>] array of entity hashes with :name, :kind, :aliases
|
|
48
|
+
# @raise [ConfigurationError] if no LLM client is configured
|
|
78
49
|
def extract_entities(text)
|
|
79
50
|
return [] if text.nil? || text.strip.empty?
|
|
80
51
|
|
|
81
52
|
client = config.llm_client
|
|
82
53
|
raise ConfigurationError, "LLM client not configured" unless client
|
|
83
54
|
|
|
84
|
-
prompt = format(
|
|
55
|
+
prompt = format(config.prompts.entity_extraction, text: text)
|
|
85
56
|
response = call_llm(client, prompt)
|
|
86
57
|
|
|
87
58
|
parse_entity_response(response)
|
|
@@ -135,7 +106,7 @@ module FactDb
|
|
|
135
106
|
parsed.map do |entity_data|
|
|
136
107
|
build_entity(
|
|
137
108
|
name: entity_data["name"],
|
|
138
|
-
|
|
109
|
+
kind: entity_data["type"] || "concept",
|
|
139
110
|
aliases: entity_data["aliases"] || [],
|
|
140
111
|
attributes: entity_data["attributes"] || {}
|
|
141
112
|
)
|
|
@@ -151,9 +122,10 @@ module FactDb
|
|
|
151
122
|
mentions_data.map do |mention|
|
|
152
123
|
build_mention(
|
|
153
124
|
name: mention["name"],
|
|
154
|
-
|
|
125
|
+
kind: mention["type"] || "concept",
|
|
155
126
|
role: mention["role"],
|
|
156
|
-
confidence: mention["confidence"]&.to_f || 1.0
|
|
127
|
+
confidence: mention["confidence"]&.to_f || 1.0,
|
|
128
|
+
aliases: mention["aliases"] || []
|
|
157
129
|
)
|
|
158
130
|
end
|
|
159
131
|
end
|
|
@@ -2,10 +2,29 @@
|
|
|
2
2
|
|
|
3
3
|
module FactDb
|
|
4
4
|
module Extractors
|
|
5
|
+
# Manual fact extractor for API-driven fact creation
|
|
6
|
+
#
|
|
7
|
+
# Passes through user-provided text as a single fact without any
|
|
8
|
+
# automated extraction. Used when the user provides fact text and
|
|
9
|
+
# metadata directly via the API.
|
|
10
|
+
#
|
|
11
|
+
# @example Extract a manual fact
|
|
12
|
+
# extractor = ManualExtractor.new
|
|
13
|
+
# facts = extractor.extract("John works at Acme", valid_at: Date.today)
|
|
14
|
+
#
|
|
5
15
|
class ManualExtractor < Base
|
|
6
|
-
#
|
|
7
|
-
#
|
|
8
|
-
#
|
|
16
|
+
# Extracts a single fact from the provided text
|
|
17
|
+
#
|
|
18
|
+
# Returns the text as-is without parsing. All metadata comes from context.
|
|
19
|
+
#
|
|
20
|
+
# @param text [String] the fact text
|
|
21
|
+
# @param context [Hash] fact metadata
|
|
22
|
+
# @option context [Date, Time] :valid_at when the fact became valid
|
|
23
|
+
# @option context [Date, Time] :invalid_at when the fact became invalid
|
|
24
|
+
# @option context [Array<Hash>] :mentions entity mentions
|
|
25
|
+
# @option context [Float] :confidence confidence score
|
|
26
|
+
# @option context [Hash] :metadata additional metadata
|
|
27
|
+
# @return [Array<Hash>] array with single fact hash, or empty if text is blank
|
|
9
28
|
def extract(text, context = {})
|
|
10
29
|
return [] if text.nil? || text.strip.empty?
|
|
11
30
|
|
|
@@ -23,12 +42,25 @@ module FactDb
|
|
|
23
42
|
]
|
|
24
43
|
end
|
|
25
44
|
|
|
26
|
-
#
|
|
45
|
+
# Returns empty array since manual extraction expects entities to be provided
|
|
46
|
+
#
|
|
47
|
+
# @param text [String] ignored
|
|
48
|
+
# @return [Array] empty array
|
|
27
49
|
def extract_entities(text)
|
|
28
50
|
[]
|
|
29
51
|
end
|
|
30
52
|
|
|
31
|
-
#
|
|
53
|
+
# Creates a single fact with full control over all attributes
|
|
54
|
+
#
|
|
55
|
+
# Convenience method that wraps #extract with named parameters.
|
|
56
|
+
#
|
|
57
|
+
# @param text [String] the fact text
|
|
58
|
+
# @param valid_at [Date, Time] when the fact became valid
|
|
59
|
+
# @param invalid_at [Date, Time, nil] when the fact became invalid
|
|
60
|
+
# @param mentions [Array<Hash>] entity mentions
|
|
61
|
+
# @param confidence [Float] confidence score (0.0 to 1.0)
|
|
62
|
+
# @param metadata [Hash] additional metadata
|
|
63
|
+
# @return [Hash] the fact hash
|
|
32
64
|
def create_fact(text:, valid_at:, invalid_at: nil, mentions: [], confidence: 1.0, metadata: {})
|
|
33
65
|
extract(text, {
|
|
34
66
|
valid_at: valid_at,
|
|
@@ -39,7 +71,15 @@ module FactDb
|
|
|
39
71
|
}).first
|
|
40
72
|
end
|
|
41
73
|
|
|
42
|
-
#
|
|
74
|
+
# Creates an entity hash
|
|
75
|
+
#
|
|
76
|
+
# Convenience method for building entity data manually.
|
|
77
|
+
#
|
|
78
|
+
# @param name [String] the entity name
|
|
79
|
+
# @param type [String, Symbol] entity kind (person, organization, etc.)
|
|
80
|
+
# @param aliases [Array<String>] alternative names
|
|
81
|
+
# @param attributes [Hash] additional attributes
|
|
82
|
+
# @return [Hash] the entity hash
|
|
43
83
|
def create_entity(name:, type:, aliases: [], attributes: {})
|
|
44
84
|
build_entity(
|
|
45
85
|
name: name,
|
|
@@ -2,8 +2,18 @@
|
|
|
2
2
|
|
|
3
3
|
module FactDb
|
|
4
4
|
module Extractors
|
|
5
|
+
# Rule-based fact extractor using regex patterns
|
|
6
|
+
#
|
|
7
|
+
# Extracts facts from text using predefined regex patterns for common
|
|
8
|
+
# fact types like employment, relationships, and locations. Does not
|
|
9
|
+
# require an LLM but is limited to recognized patterns.
|
|
10
|
+
#
|
|
11
|
+
# @example Extract facts using patterns
|
|
12
|
+
# extractor = RuleBasedExtractor.new
|
|
13
|
+
# facts = extractor.extract("Paula works at Microsoft in Seattle")
|
|
14
|
+
#
|
|
5
15
|
class RuleBasedExtractor < Base
|
|
6
|
-
#
|
|
16
|
+
# @return [Array<Regexp>] patterns for extracting start dates
|
|
7
17
|
DATE_PATTERNS = [
|
|
8
18
|
# "on January 10, 2024"
|
|
9
19
|
/(?:on|since|from|as of|starting)\s+(\w+\s+\d{1,2},?\s+\d{4})/i,
|
|
@@ -15,40 +25,50 @@ module FactDb
|
|
|
15
25
|
/(?:in|during)\s+(\d{4})\b/i
|
|
16
26
|
].freeze
|
|
17
27
|
|
|
28
|
+
# @return [Array<Regexp>] patterns for extracting end dates
|
|
18
29
|
END_DATE_PATTERNS = [
|
|
19
30
|
# "until January 10, 2024"
|
|
20
31
|
/(?:until|through|to|ended|left)\s+(\w+\s+\d{1,2},?\s+\d{4})/i,
|
|
21
32
|
/(?:until|through|to|ended|left)\s+(\d{4}-\d{2}-\d{2})/i
|
|
22
33
|
].freeze
|
|
23
34
|
|
|
24
|
-
#
|
|
35
|
+
# @return [Array<Regexp>] patterns for employment facts
|
|
25
36
|
EMPLOYMENT_PATTERNS = [
|
|
26
37
|
# "Paula works at Microsoft"
|
|
27
|
-
/(\b[A-Z][a-z]+(
|
|
38
|
+
/(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:works?|worked|is working)[ ]+(?:at|for)[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)\b/,
|
|
28
39
|
# "Paula joined Microsoft"
|
|
29
|
-
/(\b[A-Z][a-z]+(
|
|
40
|
+
/(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:joined|started at|was hired by)[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)\b/,
|
|
30
41
|
# "Paula left Microsoft"
|
|
31
|
-
/(\b[A-Z][a-z]+(
|
|
42
|
+
/(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:left|departed|resigned from|was fired from)[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)\b/,
|
|
32
43
|
# "Paula is a Principal Engineer at Microsoft"
|
|
33
|
-
/(\b[A-Z][a-z]+(
|
|
44
|
+
/(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:is|was|became)[ ]+(?:a[ ]+)?([A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)[ ]+at[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)\b/
|
|
34
45
|
].freeze
|
|
35
46
|
|
|
36
|
-
#
|
|
47
|
+
# @return [Array<Regexp>] patterns for relationship facts
|
|
37
48
|
RELATIONSHIP_PATTERNS = [
|
|
38
49
|
# "Paula is married to John"
|
|
39
|
-
/(\b[A-Z][a-z]+(
|
|
50
|
+
/(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:is|was)[ ]+(?:married to|engaged to|dating)[ ]+(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b/,
|
|
40
51
|
# "Paula is the CEO of Microsoft"
|
|
41
|
-
/(\b[A-Z][a-z]+(
|
|
52
|
+
/(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:is|was)[ ]+(?:the[ ]+)?(\w+(?:[ ]+\w+)*)[ ]+of[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)\b/
|
|
42
53
|
].freeze
|
|
43
54
|
|
|
44
|
-
#
|
|
55
|
+
# @return [Array<Regexp>] patterns for location facts
|
|
45
56
|
LOCATION_PATTERNS = [
|
|
46
|
-
# "Paula lives in Seattle"
|
|
47
|
-
/(\b[A-Z][a-z]+(
|
|
48
|
-
# "Microsoft is headquartered in Redmond"
|
|
49
|
-
/(\b[A-Z][A-Za-z]+(
|
|
57
|
+
# "Paula lives in Seattle" or "Bob lives in New York City"
|
|
58
|
+
/(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:lives?|lived|is based|was based|relocated|moved)[ ]+(?:in|to)[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*(?:,[ ]+[A-Z]{2})?)\b/,
|
|
59
|
+
# "Microsoft is headquartered in Redmond" or "in Seattle, Washington"
|
|
60
|
+
/(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)\b[ ]+(?:is|was)[ ]+(?:headquartered|located|based)[ ]+in[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*(?:,[ ]+[A-Z][A-Za-z]+)?)\b/
|
|
50
61
|
].freeze
|
|
51
62
|
|
|
63
|
+
# Extracts facts from text using regex patterns
|
|
64
|
+
#
|
|
65
|
+
# Applies employment, relationship, and location patterns to identify
|
|
66
|
+
# facts, with associated entity mentions and temporal information.
|
|
67
|
+
#
|
|
68
|
+
# @param text [String] raw text to extract from
|
|
69
|
+
# @param context [Hash] additional context
|
|
70
|
+
# @option context [Date, Time] :captured_at default timestamp for facts
|
|
71
|
+
# @return [Array<Hash>] array of fact hashes, deduplicated by text
|
|
52
72
|
def extract(text, context = {})
|
|
53
73
|
return [] if text.nil? || text.strip.empty?
|
|
54
74
|
|
|
@@ -66,23 +86,35 @@ module FactDb
|
|
|
66
86
|
facts.uniq { |f| f[:text] }
|
|
67
87
|
end
|
|
68
88
|
|
|
89
|
+
# Extracts entities from text using regex patterns
|
|
90
|
+
#
|
|
91
|
+
# Identifies person names, organization names, and locations using
|
|
92
|
+
# pattern matching. Filters out common words, job titles, and known phrases.
|
|
93
|
+
#
|
|
94
|
+
# @param text [String] raw text to extract from
|
|
95
|
+
# @return [Array<Hash>] array of entity hashes with :name and :kind
|
|
69
96
|
def extract_entities(text)
|
|
70
97
|
return [] if text.nil? || text.strip.empty?
|
|
71
98
|
|
|
72
99
|
entities = []
|
|
73
100
|
|
|
74
|
-
# Extract person names (
|
|
75
|
-
|
|
101
|
+
# Extract person names (capitalized word sequences on same line)
|
|
102
|
+
# Use [ ]+ instead of \s+ to avoid matching across newlines
|
|
103
|
+
text.scan(/\b([A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)+)\b/).flatten.uniq.each do |name|
|
|
76
104
|
next if common_word?(name)
|
|
105
|
+
next if job_title?(name)
|
|
106
|
+
next if common_phrase?(name)
|
|
107
|
+
next if known_place?(name)
|
|
108
|
+
next if organization_indicator?(name)
|
|
77
109
|
|
|
78
|
-
entities << build_entity(name: name,
|
|
110
|
+
entities << build_entity(name: name, kind: "person")
|
|
79
111
|
end
|
|
80
112
|
|
|
81
113
|
# Extract organization names (from employment patterns)
|
|
82
114
|
EMPLOYMENT_PATTERNS.each do |pattern|
|
|
83
115
|
text.scan(pattern).each do |match|
|
|
84
116
|
org_name = match.last
|
|
85
|
-
entities << build_entity(name: org_name,
|
|
117
|
+
entities << build_entity(name: org_name, kind: "organization") unless common_word?(org_name)
|
|
86
118
|
end
|
|
87
119
|
end
|
|
88
120
|
|
|
@@ -90,7 +122,7 @@ module FactDb
|
|
|
90
122
|
LOCATION_PATTERNS.each do |pattern|
|
|
91
123
|
text.scan(pattern).each do |match|
|
|
92
124
|
location = match.last
|
|
93
|
-
entities << build_entity(name: location,
|
|
125
|
+
entities << build_entity(name: location, kind: "place") unless common_word?(location)
|
|
94
126
|
end
|
|
95
127
|
end
|
|
96
128
|
|
|
@@ -116,13 +148,13 @@ module FactDb
|
|
|
116
148
|
invalid_at = is_termination ? (extract_end_date(text) || default_date) : nil
|
|
117
149
|
|
|
118
150
|
mentions = [
|
|
119
|
-
build_mention(name: person,
|
|
120
|
-
build_mention(name: org,
|
|
151
|
+
build_mention(name: person, kind: "person", role: "subject"),
|
|
152
|
+
build_mention(name: org, kind: "organization", role: "object")
|
|
121
153
|
]
|
|
122
154
|
|
|
123
155
|
# Add role if present
|
|
124
156
|
if rest.length > 1
|
|
125
|
-
mentions << build_mention(name: rest[0],
|
|
157
|
+
mentions << build_mention(name: rest[0], kind: "concept", role: "instrument")
|
|
126
158
|
end
|
|
127
159
|
|
|
128
160
|
facts << build_fact(
|
|
@@ -148,7 +180,7 @@ module FactDb
|
|
|
148
180
|
|
|
149
181
|
mentions = match.map.with_index do |name, i|
|
|
150
182
|
role = i.zero? ? "subject" : "object"
|
|
151
|
-
build_mention(name: name,
|
|
183
|
+
build_mention(name: name, kind: "person", role: role)
|
|
152
184
|
end
|
|
153
185
|
|
|
154
186
|
facts << build_fact(
|
|
@@ -177,8 +209,8 @@ module FactDb
|
|
|
177
209
|
entity_type = text.match?(/#{Regexp.escape(entity_name)}\s+(?:lives?|lived)/i) ? "person" : "organization"
|
|
178
210
|
|
|
179
211
|
mentions = [
|
|
180
|
-
build_mention(name: entity_name,
|
|
181
|
-
build_mention(name: location,
|
|
212
|
+
build_mention(name: entity_name, kind: entity_type, role: "subject"),
|
|
213
|
+
build_mention(name: location, kind: "place", role: "location")
|
|
182
214
|
]
|
|
183
215
|
|
|
184
216
|
facts << build_fact(
|
|
@@ -223,6 +255,85 @@ module FactDb
|
|
|
223
255
|
]
|
|
224
256
|
common_words.any? { |w| w.casecmp?(word) }
|
|
225
257
|
end
|
|
258
|
+
|
|
259
|
+
def job_title?(text)
|
|
260
|
+
# Common job title words that indicate this is a role, not a person name
|
|
261
|
+
title_indicators = %w[
|
|
262
|
+
Chief Executive Officer Director Manager Engineer Developer
|
|
263
|
+
President Vice Principal Senior Junior Lead Head
|
|
264
|
+
Analyst Coordinator Administrator Assistant Specialist
|
|
265
|
+
Consultant Architect Designer Technician Supervisor
|
|
266
|
+
CTO CEO CFO COO CMO CIO CPO
|
|
267
|
+
VP SVP EVP
|
|
268
|
+
]
|
|
269
|
+
|
|
270
|
+
words = text.split(/\s+/)
|
|
271
|
+
|
|
272
|
+
# If any word is a title indicator, it's likely a job title
|
|
273
|
+
words.any? { |word| title_indicators.any? { |t| t.casecmp?(word) } }
|
|
274
|
+
end
|
|
275
|
+
|
|
276
|
+
def common_phrase?(text)
|
|
277
|
+
# Common document phrases that are not person names
|
|
278
|
+
phrases = [
|
|
279
|
+
/Team\s+Updates?/i,
|
|
280
|
+
/Action\s+Items?/i,
|
|
281
|
+
/Meeting\s+Notes?/i,
|
|
282
|
+
/Status\s+Meeting/i,
|
|
283
|
+
/Project\s+Status/i,
|
|
284
|
+
/Human\s+Resources?/i,
|
|
285
|
+
/Best\s+Regards?/i,
|
|
286
|
+
/Immediate\s+Release/i,
|
|
287
|
+
/New\s+Leadership/i,
|
|
288
|
+
/Appoints?\s+New/i,
|
|
289
|
+
/Recent\s+\w+/i,
|
|
290
|
+
/Please\s+\w+/i
|
|
291
|
+
]
|
|
292
|
+
|
|
293
|
+
phrases.any? { |pattern| text.match?(pattern) }
|
|
294
|
+
end
|
|
295
|
+
|
|
296
|
+
def known_place?(text)
|
|
297
|
+
# Common city/place names or location indicators
|
|
298
|
+
place_indicators = %w[
|
|
299
|
+
City County State Province District Region
|
|
300
|
+
Beach Park Heights Hills Valley Springs Lake
|
|
301
|
+
Island Harbor Port
|
|
302
|
+
]
|
|
303
|
+
|
|
304
|
+
# Common multi-word US city names
|
|
305
|
+
known_cities = [
|
|
306
|
+
"New York", "Los Angeles", "San Francisco", "San Diego", "San Jose",
|
|
307
|
+
"San Antonio", "Las Vegas", "Salt Lake", "New Orleans", "Fort Worth",
|
|
308
|
+
"Fort Lauderdale", "St Louis", "St Paul", "El Paso", "Santa Fe",
|
|
309
|
+
"Santa Monica", "Palm Beach", "Long Beach", "Virginia Beach"
|
|
310
|
+
]
|
|
311
|
+
|
|
312
|
+
words = text.split(/\s+/)
|
|
313
|
+
|
|
314
|
+
# Check for place indicator words
|
|
315
|
+
return true if words.any? { |word| place_indicators.any? { |p| p.casecmp?(word) } }
|
|
316
|
+
|
|
317
|
+
# Check for known city names
|
|
318
|
+
known_cities.any? { |city| text.casecmp?(city) || text.start_with?("#{city} ") }
|
|
319
|
+
end
|
|
320
|
+
|
|
321
|
+
def organization_indicator?(text)
|
|
322
|
+
# Words that indicate an organization, not a person
|
|
323
|
+
org_indicators = %w[
|
|
324
|
+
Solutions Technologies Systems Services Group
|
|
325
|
+
Partners Associates Consulting Agency
|
|
326
|
+
Industries Enterprises Holdings Ventures
|
|
327
|
+
Foundation Institute University College
|
|
328
|
+
Global International National Regional
|
|
329
|
+
Tech Corp Labs
|
|
330
|
+
]
|
|
331
|
+
|
|
332
|
+
words = text.split(/\s+/)
|
|
333
|
+
|
|
334
|
+
# If any word is an org indicator, it's likely an organization
|
|
335
|
+
words.any? { |word| org_indicators.any? { |o| o.casecmp?(word) } }
|
|
336
|
+
end
|
|
226
337
|
end
|
|
227
338
|
end
|
|
228
339
|
end
|
data/lib/fact_db/llm/adapter.rb
CHANGED
|
@@ -29,9 +29,9 @@ module FactDb
|
|
|
29
29
|
# # llm_api_key: <%= ENV["ANTHROPIC_API_KEY"] %>
|
|
30
30
|
#
|
|
31
31
|
# @example Configure via environment variables
|
|
32
|
-
# #
|
|
33
|
-
# #
|
|
34
|
-
# #
|
|
32
|
+
# # FACT_DB_LLM_PROVIDER=anthropic
|
|
33
|
+
# # FACT_DB_LLM_MODEL=claude-sonnet-4-20250514
|
|
34
|
+
# # FACT_DB_LLM_API_KEY=sk-...
|
|
35
35
|
#
|
|
36
36
|
class Adapter
|
|
37
37
|
attr_reader :model, :provider
|