fact_db 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. checksums.yaml +4 -4
  2. data/.envrc +2 -0
  3. data/.yardopts +5 -0
  4. data/CHANGELOG.md +64 -0
  5. data/README.md +107 -6
  6. data/Rakefile +243 -10
  7. data/db/migrate/001_enable_extensions.rb +1 -0
  8. data/db/migrate/002_create_sources.rb +49 -0
  9. data/db/migrate/003_create_entities.rb +27 -15
  10. data/db/migrate/004_create_entity_aliases.rb +20 -7
  11. data/db/migrate/005_create_facts.rb +37 -21
  12. data/db/migrate/006_create_entity_mentions.rb +14 -6
  13. data/db/migrate/007_create_fact_sources.rb +16 -8
  14. data/docs/api/extractors/index.md +5 -5
  15. data/docs/api/extractors/llm.md +17 -17
  16. data/docs/api/extractors/rule-based.md +14 -14
  17. data/docs/api/facts.md +20 -20
  18. data/docs/api/index.md +4 -4
  19. data/docs/api/models/entity.md +21 -21
  20. data/docs/api/models/fact.md +15 -15
  21. data/docs/api/models/index.md +7 -7
  22. data/docs/api/models/{content.md → source.md} +29 -29
  23. data/docs/api/pipeline/extraction.md +25 -25
  24. data/docs/api/pipeline/index.md +1 -1
  25. data/docs/api/pipeline/resolution.md +4 -4
  26. data/docs/api/services/entity-service.md +20 -20
  27. data/docs/api/services/fact-service.md +12 -12
  28. data/docs/api/services/index.md +5 -5
  29. data/docs/api/services/{content-service.md → source-service.md} +27 -27
  30. data/docs/architecture/database-schema.md +46 -46
  31. data/docs/architecture/entity-resolution.md +6 -6
  32. data/docs/architecture/index.md +10 -10
  33. data/docs/architecture/temporal-facts.md +5 -5
  34. data/docs/architecture/three-layer-model.md +17 -17
  35. data/docs/concepts.md +6 -6
  36. data/docs/examples/basic-usage.md +20 -20
  37. data/docs/examples/hr-onboarding.md +17 -17
  38. data/docs/examples/index.md +4 -4
  39. data/docs/examples/news-analysis.md +23 -23
  40. data/docs/getting-started/database-setup.md +28 -20
  41. data/docs/getting-started/index.md +3 -3
  42. data/docs/getting-started/quick-start.md +33 -30
  43. data/docs/guides/batch-processing.md +26 -26
  44. data/docs/guides/configuration.md +158 -77
  45. data/docs/guides/entity-management.md +14 -14
  46. data/docs/guides/extracting-facts.md +28 -28
  47. data/docs/guides/ingesting-content.md +14 -14
  48. data/docs/guides/llm-integration.md +40 -32
  49. data/docs/guides/temporal-queries.md +11 -11
  50. data/docs/index.md +6 -2
  51. data/examples/.envrc +4 -0
  52. data/examples/.gitignore +1 -0
  53. data/examples/001_configuration.rb +312 -0
  54. data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
  55. data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
  56. data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
  57. data/examples/040_output_formats.rb +177 -0
  58. data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
  59. data/examples/060_fluent_temporal_api.rb +217 -0
  60. data/examples/070_introspection.rb +252 -0
  61. data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
  62. data/examples/090_ingest_demo.rb +515 -0
  63. data/examples/100_query_context.rb +668 -0
  64. data/examples/110_prove_it.rb +204 -0
  65. data/examples/120_dump_database.rb +358 -0
  66. data/examples/130_rag_feedback_loop.rb +858 -0
  67. data/examples/README.md +229 -15
  68. data/examples/data/lincoln_associates.md +201 -0
  69. data/examples/data/lincoln_biography.md +66 -0
  70. data/examples/data/lincoln_cabinet.md +243 -0
  71. data/examples/data/lincoln_family.md +163 -0
  72. data/examples/data/lincoln_military.md +241 -0
  73. data/examples/data/lincoln_todd_family.md +136 -0
  74. data/examples/ingest_reporter.rb +335 -0
  75. data/examples/utilities.rb +182 -0
  76. data/lib/fact_db/config/defaults.yml +254 -0
  77. data/lib/fact_db/config.rb +94 -35
  78. data/lib/fact_db/database.rb +98 -8
  79. data/lib/fact_db/extractors/base.rb +106 -21
  80. data/lib/fact_db/extractors/llm_extractor.rb +35 -63
  81. data/lib/fact_db/extractors/manual_extractor.rb +46 -6
  82. data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
  83. data/lib/fact_db/llm/adapter.rb +3 -3
  84. data/lib/fact_db/models/entity.rb +94 -22
  85. data/lib/fact_db/models/entity_alias.rb +41 -7
  86. data/lib/fact_db/models/entity_mention.rb +34 -1
  87. data/lib/fact_db/models/fact.rb +259 -28
  88. data/lib/fact_db/models/fact_source.rb +43 -9
  89. data/lib/fact_db/models/source.rb +113 -0
  90. data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
  91. data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
  92. data/lib/fact_db/query_result.rb +202 -0
  93. data/lib/fact_db/resolution/entity_resolver.rb +139 -39
  94. data/lib/fact_db/resolution/fact_resolver.rb +86 -14
  95. data/lib/fact_db/services/entity_service.rb +246 -37
  96. data/lib/fact_db/services/fact_service.rb +254 -17
  97. data/lib/fact_db/services/source_service.rb +164 -0
  98. data/lib/fact_db/temporal/query.rb +71 -7
  99. data/lib/fact_db/temporal/query_builder.rb +69 -0
  100. data/lib/fact_db/temporal/timeline.rb +102 -11
  101. data/lib/fact_db/transformers/base.rb +77 -0
  102. data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
  103. data/lib/fact_db/transformers/json_transformer.rb +17 -0
  104. data/lib/fact_db/transformers/raw_transformer.rb +35 -0
  105. data/lib/fact_db/transformers/text_transformer.rb +114 -0
  106. data/lib/fact_db/transformers/triple_transformer.rb +138 -0
  107. data/lib/fact_db/validation/alias_filter.rb +185 -0
  108. data/lib/fact_db/version.rb +1 -1
  109. data/lib/fact_db.rb +281 -30
  110. data/mkdocs.yml +2 -2
  111. metadata +60 -16
  112. data/db/migrate/002_create_contents.rb +0 -44
  113. data/lib/fact_db/models/content.rb +0 -62
  114. data/lib/fact_db/services/content_service.rb +0 -93
@@ -2,34 +2,73 @@
2
2
 
3
3
  module FactDb
4
4
  module Extractors
5
+ # Abstract base class for fact extractors
6
+ #
7
+ # Provides common interface and helper methods for extracting facts and entities
8
+ # from text. Subclasses must implement #extract and #extract_entities.
9
+ #
10
+ # @abstract Subclass and override {#extract} and {#extract_entities} to implement.
11
+ #
12
+ # @example Create a custom extractor
13
+ # class MyExtractor < FactDb::Extractors::Base
14
+ # def extract(text, context = {})
15
+ # # Implementation
16
+ # end
17
+ #
18
+ # def extract_entities(text)
19
+ # # Implementation
20
+ # end
21
+ # end
22
+ #
5
23
  class Base
24
+ # @return [FactDb::Config] the configuration object
6
25
  attr_reader :config
7
26
 
27
+ # Initializes a new extractor
28
+ #
29
+ # @param config [FactDb::Config] configuration object (defaults to FactDb.config)
8
30
  def initialize(config = FactDb.config)
9
31
  @config = config
10
32
  end
11
33
 
12
- # Extract facts from text
13
- # @param text [String] Raw text to extract from
14
- # @param context [Hash] Additional context (captured_at, source_uri, etc.)
15
- # @return [Array<Hash>] Array of fact data hashes
34
+ # Extracts facts from text
35
+ #
36
+ # @abstract Subclass and override this method
37
+ # @param text [String] raw text to extract from
38
+ # @param context [Hash] additional context (captured_at, source_uri, etc.)
39
+ # @return [Array<Hash>] array of fact data hashes
40
+ # @raise [NotImplementedError] if not implemented by subclass
16
41
  def extract(text, context = {})
17
42
  raise NotImplementedError, "#{self.class} must implement #extract"
18
43
  end
19
44
 
20
- # Extract entities from text
21
- # @param text [String] Raw text to extract from
22
- # @return [Array<Hash>] Array of { name:, type:, aliases: }
45
+ # Extracts entities from text
46
+ #
47
+ # @abstract Subclass and override this method
48
+ # @param text [String] raw text to extract from
49
+ # @return [Array<Hash>] array of entity hashes with :name, :kind, :aliases
50
+ # @raise [NotImplementedError] if not implemented by subclass
23
51
  def extract_entities(text)
24
52
  raise NotImplementedError, "#{self.class} must implement #extract_entities"
25
53
  end
26
54
 
27
- # Get the extraction method name
55
+ # Returns the extraction method name derived from class name
56
+ #
57
+ # @return [String] method name (e.g., "manual", "llm", "rule_based")
28
58
  def extraction_method
29
59
  self.class.name.split("::").last.sub("Extractor", "").underscore
30
60
  end
31
61
 
32
62
  class << self
63
+ # Factory method to create an extractor by type
64
+ #
65
+ # @param type [Symbol, String] extractor type (:manual, :llm, :rule_based)
66
+ # @param config [FactDb::Config] configuration object
67
+ # @return [Base] an extractor instance
68
+ # @raise [ArgumentError] if type is unknown
69
+ #
70
+ # @example
71
+ # extractor = FactDb::Extractors::Base.for(:llm)
33
72
  def for(type, config = FactDb.config)
34
73
  case type.to_sym
35
74
  when :manual
@@ -43,6 +82,9 @@ module FactDb
43
82
  end
44
83
  end
45
84
 
85
+ # Returns list of available extractor types
86
+ #
87
+ # @return [Array<Symbol>] available extractor type symbols
46
88
  def available_types
47
89
  %i[manual llm rule_based]
48
90
  end
@@ -50,7 +92,12 @@ module FactDb
50
92
 
51
93
  protected
52
94
 
53
- # Parse a date string, returning nil if invalid
95
+ # Parses a date string, returning nil if invalid
96
+ #
97
+ # Supports natural language parsing via Chronic if available.
98
+ #
99
+ # @param date_str [String, nil] date string to parse
100
+ # @return [Date, nil] parsed date or nil if invalid
54
101
  def parse_date(date_str)
55
102
  return nil if date_str.nil? || date_str.to_s.empty?
56
103
 
@@ -65,7 +112,12 @@ module FactDb
65
112
  nil
66
113
  end
67
114
 
68
- # Parse a timestamp string, returning nil if invalid
115
+ # Parses a timestamp string, returning nil if invalid
116
+ #
117
+ # Supports natural language parsing via Chronic if available.
118
+ #
119
+ # @param timestamp_str [String, nil] timestamp string to parse
120
+ # @return [Time, nil] parsed time or nil if invalid
69
121
  def parse_timestamp(timestamp_str)
70
122
  return nil if timestamp_str.nil? || timestamp_str.to_s.empty?
71
123
 
@@ -80,7 +132,15 @@ module FactDb
80
132
  nil
81
133
  end
82
134
 
83
- # Build a standardized fact hash
135
+ # Builds a standardized fact hash
136
+ #
137
+ # @param text [String] the fact text
138
+ # @param valid_at [Date, Time] when the fact became valid
139
+ # @param invalid_at [Date, Time, nil] when the fact became invalid
140
+ # @param mentions [Array<Hash>] entity mentions
141
+ # @param confidence [Float] confidence score (0.0 to 1.0)
142
+ # @param metadata [Hash] additional metadata
143
+ # @return [Hash] standardized fact hash for persistence
84
144
  def build_fact(text:, valid_at:, invalid_at: nil, mentions: [], confidence: 1.0, metadata: {})
85
145
  {
86
146
  text: text.strip,
@@ -93,23 +153,48 @@ module FactDb
93
153
  }
94
154
  end
95
155
 
96
- # Build a standardized entity hash
97
- def build_entity(name:, type:, aliases: [], attributes: {})
156
+ # Builds a standardized entity hash
157
+ #
158
+ # Automatically filters aliases through AliasFilter.
159
+ #
160
+ # @param name [String] the entity name
161
+ # @param kind [String, Symbol] entity kind (person, organization, etc.)
162
+ # @param aliases [Array<String>] alternative names
163
+ # @param attributes [Hash] additional attributes
164
+ # @return [Hash] standardized entity hash
165
+ def build_entity(name:, kind:, aliases: [], attributes: {})
166
+ canonical_name = name.strip
167
+ filtered_aliases = Validation::AliasFilter.filter(aliases, name: canonical_name)
168
+
98
169
  {
99
- name: name.strip,
100
- type: type.to_s,
101
- aliases: aliases.map(&:strip),
170
+ name: canonical_name,
171
+ kind: kind.to_s,
172
+ aliases: filtered_aliases,
102
173
  attributes: attributes
103
174
  }
104
175
  end
105
176
 
106
- # Build a standardized mention hash
107
- def build_mention(name:, type:, role: nil, confidence: 1.0)
177
+ # Builds a standardized entity mention hash
178
+ #
179
+ # Automatically filters aliases through AliasFilter.
180
+ #
181
+ # @param name [String] the entity name
182
+ # @param kind [String, Symbol] entity kind
183
+ # @param role [String, Symbol, nil] mention role (subject, object, etc.)
184
+ # @param confidence [Float] confidence score (0.0 to 1.0)
185
+ # @param aliases [Array<String>] alternative names
186
+ # @return [Hash] standardized mention hash
187
+ def build_mention(name:, kind:, role: nil, confidence: 1.0, aliases: [])
188
+ canonical_name = name.strip
189
+ raw_aliases = Array(aliases).map { |a| a.to_s.strip }.reject(&:empty?)
190
+ filtered_aliases = Validation::AliasFilter.filter(raw_aliases, name: canonical_name)
191
+
108
192
  {
109
- name: name.strip,
110
- type: type.to_s,
193
+ name: canonical_name,
194
+ kind: kind.to_s,
111
195
  role: role&.to_s,
112
- confidence: confidence
196
+ confidence: confidence,
197
+ aliases: filtered_aliases
113
198
  }
114
199
  end
115
200
  end
@@ -4,84 +4,55 @@ require "json"
4
4
 
5
5
  module FactDb
6
6
  module Extractors
7
+ # LLM-based fact extractor using language models
8
+ #
9
+ # Uses a configured LLM client to extract atomic facts and entities from
10
+ # unstructured text. Parses JSON responses from the LLM and builds
11
+ # standardized fact/entity hashes.
12
+ #
13
+ # @example Extract facts using LLM
14
+ # FactDb.configure { |c| c.llm_client = MyLLMClient.new }
15
+ # extractor = LLMExtractor.new
16
+ # facts = extractor.extract("Paula joined Microsoft on January 10, 2024...")
17
+ #
7
18
  class LLMExtractor < Base
8
- FACT_EXTRACTION_PROMPT = <<~PROMPT
9
- Extract factual assertions from the following text. For each fact:
10
- 1. State the assertion clearly and concisely
11
- 2. Identify when it became true (valid_at) if mentioned
12
- 3. Identify when it stopped being true (invalid_at) if mentioned
13
- 4. Identify entities mentioned (people, organizations, places, products)
14
- 5. Assign a confidence score (0.0 to 1.0) based on how explicitly stated the fact is
15
-
16
- Text:
17
- %<text>s
18
-
19
- Return as a JSON array with this structure:
20
- [
21
- {
22
- "text": "Paula works at Microsoft as Principal Engineer",
23
- "valid_at": "2024-01-10",
24
- "invalid_at": null,
25
- "confidence": 0.95,
26
- "mentions": [
27
- {"name": "Paula", "type": "person", "role": "subject"},
28
- {"name": "Microsoft", "type": "organization", "role": "object"}
29
- ]
30
- }
31
- ]
32
-
33
- Rules:
34
- - Extract only factual assertions, not opinions or speculation
35
- - Use ISO 8601 date format (YYYY-MM-DD) when possible
36
- - Set invalid_at to null if the fact is still true or unknown
37
- - Set valid_at to null if the timing is not mentioned
38
- - Entity types: person, organization, place, product, event, concept
39
- - Roles: subject, object, location, temporal, instrument, beneficiary
40
-
41
- Return only valid JSON, no additional text.
42
- PROMPT
43
-
44
- ENTITY_EXTRACTION_PROMPT = <<~PROMPT
45
- Extract all named entities from the following text.
46
- For each entity:
47
- 1. Identify the canonical name
48
- 2. Classify the type (person, organization, place, product, event, concept)
49
- 3. List any aliases or alternative names mentioned
50
-
51
- Text:
52
- %<text>s
53
-
54
- Return as a JSON array:
55
- [
56
- {
57
- "name": "Paula Chen",
58
- "type": "person",
59
- "aliases": ["Paula", "P. Chen"]
60
- }
61
- ]
62
-
63
- Return only valid JSON, no additional text.
64
- PROMPT
65
-
19
+ # Extracts atomic facts from text using the configured LLM
20
+ #
21
+ # Prompts the LLM to identify factual assertions, temporal information,
22
+ # entity mentions with roles, and confidence scores.
23
+ #
24
+ # @param text [String] raw text to extract from
25
+ # @param context [Hash] additional context
26
+ # @option context [Date, Time] :captured_at default timestamp for facts
27
+ # @return [Array<Hash>] array of fact hashes
28
+ # @raise [ConfigurationError] if no LLM client is configured
66
29
  def extract(text, context = {})
67
30
  return [] if text.nil? || text.strip.empty?
68
31
 
69
32
  client = config.llm_client
70
33
  raise ConfigurationError, "LLM client not configured" unless client
71
34
 
72
- prompt = format(FACT_EXTRACTION_PROMPT, text: text)
35
+ prompt = format(config.prompts.fact_extraction, text: text)
73
36
  response = call_llm(client, prompt)
74
37
 
75
38
  parse_fact_response(response, context)
76
39
  end
77
40
 
41
+ # Extracts entities from text using the configured LLM
42
+ #
43
+ # Prompts the LLM to identify named entities, classify their types,
44
+ # and list any aliases or alternative names.
45
+ #
46
+ # @param text [String] raw text to extract from
47
+ # @return [Array<Hash>] array of entity hashes with :name, :kind, :aliases
48
+ # @raise [ConfigurationError] if no LLM client is configured
78
49
  def extract_entities(text)
79
50
  return [] if text.nil? || text.strip.empty?
80
51
 
81
52
  client = config.llm_client
82
53
  raise ConfigurationError, "LLM client not configured" unless client
83
54
 
84
- prompt = format(ENTITY_EXTRACTION_PROMPT, text: text)
55
+ prompt = format(config.prompts.entity_extraction, text: text)
85
56
  response = call_llm(client, prompt)
86
57
 
87
58
  parse_entity_response(response)
@@ -135,7 +106,7 @@ module FactDb
135
106
  parsed.map do |entity_data|
136
107
  build_entity(
137
108
  name: entity_data["name"],
138
- type: entity_data["type"] || "concept",
109
+ kind: entity_data["type"] || "concept",
139
110
  aliases: entity_data["aliases"] || [],
140
111
  attributes: entity_data["attributes"] || {}
141
112
  )
@@ -151,9 +122,10 @@ module FactDb
151
122
  mentions_data.map do |mention|
152
123
  build_mention(
153
124
  name: mention["name"],
154
- type: mention["type"] || "concept",
125
+ kind: mention["type"] || "concept",
155
126
  role: mention["role"],
156
- confidence: mention["confidence"]&.to_f || 1.0
127
+ confidence: mention["confidence"]&.to_f || 1.0,
128
+ aliases: mention["aliases"] || []
157
129
  )
158
130
  end
159
131
  end
@@ -2,10 +2,29 @@
2
2
 
3
3
  module FactDb
4
4
  module Extractors
5
+ # Manual fact extractor for API-driven fact creation
6
+ #
7
+ # Passes through user-provided text as a single fact without any
8
+ # automated extraction. Used when the user provides fact text and
9
+ # metadata directly via the API.
10
+ #
11
+ # @example Extract a manual fact
12
+ # extractor = ManualExtractor.new
13
+ # facts = extractor.extract("John works at Acme", valid_at: Date.today)
14
+ #
5
15
  class ManualExtractor < Base
6
- # Manual extraction passes through the text as a single fact
7
- # This is used for API-driven fact creation where the user
8
- # provides the fact text and metadata directly
16
+ # Extracts a single fact from the provided text
17
+ #
18
+ # Returns the text as-is without parsing. All metadata comes from context.
19
+ #
20
+ # @param text [String] the fact text
21
+ # @param context [Hash] fact metadata
22
+ # @option context [Date, Time] :valid_at when the fact became valid
23
+ # @option context [Date, Time] :invalid_at when the fact became invalid
24
+ # @option context [Array<Hash>] :mentions entity mentions
25
+ # @option context [Float] :confidence confidence score
26
+ # @option context [Hash] :metadata additional metadata
27
+ # @return [Array<Hash>] array with single fact hash, or empty if text is blank
9
28
  def extract(text, context = {})
10
29
  return [] if text.nil? || text.strip.empty?
11
30
 
@@ -23,12 +42,25 @@ module FactDb
23
42
  ]
24
43
  end
25
44
 
26
- # Manual extraction expects entities to be provided explicitly
45
+ # Returns empty array since manual extraction expects entities to be provided
46
+ #
47
+ # @param text [String] ignored
48
+ # @return [Array] empty array
27
49
  def extract_entities(text)
28
50
  []
29
51
  end
30
52
 
31
- # Convenience method for creating a single fact with full control
53
+ # Creates a single fact with full control over all attributes
54
+ #
55
+ # Convenience method that wraps #extract with named parameters.
56
+ #
57
+ # @param text [String] the fact text
58
+ # @param valid_at [Date, Time] when the fact became valid
59
+ # @param invalid_at [Date, Time, nil] when the fact became invalid
60
+ # @param mentions [Array<Hash>] entity mentions
61
+ # @param confidence [Float] confidence score (0.0 to 1.0)
62
+ # @param metadata [Hash] additional metadata
63
+ # @return [Hash] the fact hash
32
64
  def create_fact(text:, valid_at:, invalid_at: nil, mentions: [], confidence: 1.0, metadata: {})
33
65
  extract(text, {
34
66
  valid_at: valid_at,
@@ -39,7 +71,15 @@ module FactDb
39
71
  }).first
40
72
  end
41
73
 
42
- # Convenience method for creating an entity
74
+ # Creates an entity hash
75
+ #
76
+ # Convenience method for building entity data manually.
77
+ #
78
+ # @param name [String] the entity name
79
+ # @param type [String, Symbol] entity kind (person, organization, etc.)
80
+ # @param aliases [Array<String>] alternative names
81
+ # @param attributes [Hash] additional attributes
82
+ # @return [Hash] the entity hash
43
83
  def create_entity(name:, type:, aliases: [], attributes: {})
44
84
  build_entity(
45
85
  name: name,
@@ -2,8 +2,18 @@
2
2
 
3
3
  module FactDb
4
4
  module Extractors
5
+ # Rule-based fact extractor using regex patterns
6
+ #
7
+ # Extracts facts from text using predefined regex patterns for common
8
+ # fact types like employment, relationships, and locations. Does not
9
+ # require an LLM but is limited to recognized patterns.
10
+ #
11
+ # @example Extract facts using patterns
12
+ # extractor = RuleBasedExtractor.new
13
+ # facts = extractor.extract("Paula works at Microsoft in Seattle")
14
+ #
5
15
  class RuleBasedExtractor < Base
6
- # Date patterns for temporal extraction
16
+ # @return [Array<Regexp>] patterns for extracting start dates
7
17
  DATE_PATTERNS = [
8
18
  # "on January 10, 2024"
9
19
  /(?:on|since|from|as of|starting)\s+(\w+\s+\d{1,2},?\s+\d{4})/i,
@@ -15,40 +25,50 @@ module FactDb
15
25
  /(?:in|during)\s+(\d{4})\b/i
16
26
  ].freeze
17
27
 
28
+ # @return [Array<Regexp>] patterns for extracting end dates
18
29
  END_DATE_PATTERNS = [
19
30
  # "until January 10, 2024"
20
31
  /(?:until|through|to|ended|left)\s+(\w+\s+\d{1,2},?\s+\d{4})/i,
21
32
  /(?:until|through|to|ended|left)\s+(\d{4}-\d{2}-\d{2})/i
22
33
  ].freeze
23
34
 
24
- # Employment patterns
35
+ # @return [Array<Regexp>] patterns for employment facts
25
36
  EMPLOYMENT_PATTERNS = [
26
37
  # "Paula works at Microsoft"
27
- /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:works?|worked|is working)\s+(?:at|for)\s+(\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)/,
38
+ /(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:works?|worked|is working)[ ]+(?:at|for)[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)\b/,
28
39
  # "Paula joined Microsoft"
29
- /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:joined|started at|was hired by)\s+(\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)/,
40
+ /(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:joined|started at|was hired by)[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)\b/,
30
41
  # "Paula left Microsoft"
31
- /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:left|departed|resigned from|was fired from)\s+(\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)/,
42
+ /(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:left|departed|resigned from|was fired from)[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)\b/,
32
43
  # "Paula is a Principal Engineer at Microsoft"
33
- /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:is|was|became)\s+(?:a\s+)?([A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)\s+at\s+(\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)/
44
+ /(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:is|was|became)[ ]+(?:a[ ]+)?([A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)[ ]+at[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)\b/
34
45
  ].freeze
35
46
 
36
- # Relationship patterns
47
+ # @return [Array<Regexp>] patterns for relationship facts
37
48
  RELATIONSHIP_PATTERNS = [
38
49
  # "Paula is married to John"
39
- /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:is|was)\s+(?:married to|engaged to|dating)\s+(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)/,
50
+ /(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:is|was)[ ]+(?:married to|engaged to|dating)[ ]+(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b/,
40
51
  # "Paula is the CEO of Microsoft"
41
- /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:is|was)\s+(?:the\s+)?(\w+(?:\s+\w+)*)\s+of\s+(\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)/
52
+ /(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:is|was)[ ]+(?:the[ ]+)?(\w+(?:[ ]+\w+)*)[ ]+of[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)\b/
42
53
  ].freeze
43
54
 
44
- # Location patterns
55
+ # @return [Array<Regexp>] patterns for location facts
45
56
  LOCATION_PATTERNS = [
46
- # "Paula lives in Seattle"
47
- /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:lives?|lived|is based|was based|relocated)\s+(?:in|to)\s+(\b[A-Z][A-Za-z]+(?:,?\s+[A-Z]{2})?)/,
48
- # "Microsoft is headquartered in Redmond"
49
- /(\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)\s+(?:is|was)\s+(?:headquartered|located|based)\s+in\s+(\b[A-Z][A-Za-z]+(?:,?\s+[A-Z]{2})?)/
57
+ # "Paula lives in Seattle" or "Bob lives in New York City"
58
+ /(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:lives?|lived|is based|was based|relocated|moved)[ ]+(?:in|to)[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*(?:,[ ]+[A-Z]{2})?)\b/,
59
+ # "Microsoft is headquartered in Redmond" or "in Seattle, Washington"
60
+ /(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)\b[ ]+(?:is|was)[ ]+(?:headquartered|located|based)[ ]+in[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*(?:,[ ]+[A-Z][A-Za-z]+)?)\b/
50
61
  ].freeze
51
62
 
63
+ # Extracts facts from text using regex patterns
64
+ #
65
+ # Applies employment, relationship, and location patterns to identify
66
+ # facts, with associated entity mentions and temporal information.
67
+ #
68
+ # @param text [String] raw text to extract from
69
+ # @param context [Hash] additional context
70
+ # @option context [Date, Time] :captured_at default timestamp for facts
71
+ # @return [Array<Hash>] array of fact hashes, deduplicated by text
52
72
  def extract(text, context = {})
53
73
  return [] if text.nil? || text.strip.empty?
54
74
 
@@ -66,23 +86,35 @@ module FactDb
66
86
  facts.uniq { |f| f[:text] }
67
87
  end
68
88
 
89
+ # Extracts entities from text using regex patterns
90
+ #
91
+ # Identifies person names, organization names, and locations using
92
+ # pattern matching. Filters out common words, job titles, and known phrases.
93
+ #
94
+ # @param text [String] raw text to extract from
95
+ # @return [Array<Hash>] array of entity hashes with :name and :kind
69
96
  def extract_entities(text)
70
97
  return [] if text.nil? || text.strip.empty?
71
98
 
72
99
  entities = []
73
100
 
74
- # Extract person names (simple capitalized word sequences)
75
- text.scan(/\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b/).flatten.uniq.each do |name|
101
+ # Extract person names (capitalized word sequences on same line)
102
+ # Use [ ]+ instead of \s+ to avoid matching across newlines
103
+ text.scan(/\b([A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)+)\b/).flatten.uniq.each do |name|
76
104
  next if common_word?(name)
105
+ next if job_title?(name)
106
+ next if common_phrase?(name)
107
+ next if known_place?(name)
108
+ next if organization_indicator?(name)
77
109
 
78
- entities << build_entity(name: name, type: "person")
110
+ entities << build_entity(name: name, kind: "person")
79
111
  end
80
112
 
81
113
  # Extract organization names (from employment patterns)
82
114
  EMPLOYMENT_PATTERNS.each do |pattern|
83
115
  text.scan(pattern).each do |match|
84
116
  org_name = match.last
85
- entities << build_entity(name: org_name, type: "organization") unless common_word?(org_name)
117
+ entities << build_entity(name: org_name, kind: "organization") unless common_word?(org_name)
86
118
  end
87
119
  end
88
120
 
@@ -90,7 +122,7 @@ module FactDb
90
122
  LOCATION_PATTERNS.each do |pattern|
91
123
  text.scan(pattern).each do |match|
92
124
  location = match.last
93
- entities << build_entity(name: location, type: "place") unless common_word?(location)
125
+ entities << build_entity(name: location, kind: "place") unless common_word?(location)
94
126
  end
95
127
  end
96
128
 
@@ -116,13 +148,13 @@ module FactDb
116
148
  invalid_at = is_termination ? (extract_end_date(text) || default_date) : nil
117
149
 
118
150
  mentions = [
119
- build_mention(name: person, type: "person", role: "subject"),
120
- build_mention(name: org, type: "organization", role: "object")
151
+ build_mention(name: person, kind: "person", role: "subject"),
152
+ build_mention(name: org, kind: "organization", role: "object")
121
153
  ]
122
154
 
123
155
  # Add role if present
124
156
  if rest.length > 1
125
- mentions << build_mention(name: rest[0], type: "concept", role: "instrument")
157
+ mentions << build_mention(name: rest[0], kind: "concept", role: "instrument")
126
158
  end
127
159
 
128
160
  facts << build_fact(
@@ -148,7 +180,7 @@ module FactDb
148
180
 
149
181
  mentions = match.map.with_index do |name, i|
150
182
  role = i.zero? ? "subject" : "object"
151
- build_mention(name: name, type: "person", role: role)
183
+ build_mention(name: name, kind: "person", role: role)
152
184
  end
153
185
 
154
186
  facts << build_fact(
@@ -177,8 +209,8 @@ module FactDb
177
209
  entity_type = text.match?(/#{Regexp.escape(entity_name)}\s+(?:lives?|lived)/i) ? "person" : "organization"
178
210
 
179
211
  mentions = [
180
- build_mention(name: entity_name, type: entity_type, role: "subject"),
181
- build_mention(name: location, type: "place", role: "location")
212
+ build_mention(name: entity_name, kind: entity_type, role: "subject"),
213
+ build_mention(name: location, kind: "place", role: "location")
182
214
  ]
183
215
 
184
216
  facts << build_fact(
@@ -223,6 +255,85 @@ module FactDb
223
255
  ]
224
256
  common_words.any? { |w| w.casecmp?(word) }
225
257
  end
258
+
259
+ def job_title?(text)
260
+ # Common job title words that indicate this is a role, not a person name
261
+ title_indicators = %w[
262
+ Chief Executive Officer Director Manager Engineer Developer
263
+ President Vice Principal Senior Junior Lead Head
264
+ Analyst Coordinator Administrator Assistant Specialist
265
+ Consultant Architect Designer Technician Supervisor
266
+ CTO CEO CFO COO CMO CIO CPO
267
+ VP SVP EVP
268
+ ]
269
+
270
+ words = text.split(/\s+/)
271
+
272
+ # If any word is a title indicator, it's likely a job title
273
+ words.any? { |word| title_indicators.any? { |t| t.casecmp?(word) } }
274
+ end
275
+
276
+ def common_phrase?(text)
277
+ # Common document phrases that are not person names
278
+ phrases = [
279
+ /Team\s+Updates?/i,
280
+ /Action\s+Items?/i,
281
+ /Meeting\s+Notes?/i,
282
+ /Status\s+Meeting/i,
283
+ /Project\s+Status/i,
284
+ /Human\s+Resources?/i,
285
+ /Best\s+Regards?/i,
286
+ /Immediate\s+Release/i,
287
+ /New\s+Leadership/i,
288
+ /Appoints?\s+New/i,
289
+ /Recent\s+\w+/i,
290
+ /Please\s+\w+/i
291
+ ]
292
+
293
+ phrases.any? { |pattern| text.match?(pattern) }
294
+ end
295
+
296
+ def known_place?(text)
297
+ # Common city/place names or location indicators
298
+ place_indicators = %w[
299
+ City County State Province District Region
300
+ Beach Park Heights Hills Valley Springs Lake
301
+ Island Harbor Port
302
+ ]
303
+
304
+ # Common multi-word US city names
305
+ known_cities = [
306
+ "New York", "Los Angeles", "San Francisco", "San Diego", "San Jose",
307
+ "San Antonio", "Las Vegas", "Salt Lake", "New Orleans", "Fort Worth",
308
+ "Fort Lauderdale", "St Louis", "St Paul", "El Paso", "Santa Fe",
309
+ "Santa Monica", "Palm Beach", "Long Beach", "Virginia Beach"
310
+ ]
311
+
312
+ words = text.split(/\s+/)
313
+
314
+ # Check for place indicator words
315
+ return true if words.any? { |word| place_indicators.any? { |p| p.casecmp?(word) } }
316
+
317
+ # Check for known city names
318
+ known_cities.any? { |city| text.casecmp?(city) || text.start_with?("#{city} ") }
319
+ end
320
+
321
+ def organization_indicator?(text)
322
+ # Words that indicate an organization, not a person
323
+ org_indicators = %w[
324
+ Solutions Technologies Systems Services Group
325
+ Partners Associates Consulting Agency
326
+ Industries Enterprises Holdings Ventures
327
+ Foundation Institute University College
328
+ Global International National Regional
329
+ Tech Corp Labs
330
+ ]
331
+
332
+ words = text.split(/\s+/)
333
+
334
+ # If any word is an org indicator, it's likely an organization
335
+ words.any? { |word| org_indicators.any? { |o| o.casecmp?(word) } }
336
+ end
226
337
  end
227
338
  end
228
339
  end
@@ -29,9 +29,9 @@ module FactDb
29
29
  # # llm_api_key: <%= ENV["ANTHROPIC_API_KEY"] %>
30
30
  #
31
31
  # @example Configure via environment variables
32
- # # EVENT_CLOCK_LLM_PROVIDER=anthropic
33
- # # EVENT_CLOCK_LLM_MODEL=claude-sonnet-4-20250514
34
- # # EVENT_CLOCK_LLM_API_KEY=sk-...
32
+ # # FACT_DB_LLM_PROVIDER=anthropic
33
+ # # FACT_DB_LLM_MODEL=claude-sonnet-4-20250514
34
+ # # FACT_DB_LLM_API_KEY=sk-...
35
35
  #
36
36
  class Adapter
37
37
  attr_reader :model, :provider