RubyGems - fact_db - Versions diffs - 0.0.2 → 0.0.3 - Mend

fact_db 0.0.2 → 0.0.3

This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (114)

checksums.yaml +4 -4
data/.envrc +2 -0
data/.yardopts +5 -0
data/CHANGELOG.md +64 -0
data/README.md +107 -6
data/Rakefile +243 -10
data/db/migrate/001_enable_extensions.rb +1 -0
data/db/migrate/002_create_sources.rb +49 -0
data/db/migrate/003_create_entities.rb +27 -15
data/db/migrate/004_create_entity_aliases.rb +20 -7
data/db/migrate/005_create_facts.rb +37 -21
data/db/migrate/006_create_entity_mentions.rb +14 -6
data/db/migrate/007_create_fact_sources.rb +16 -8
data/docs/api/extractors/index.md +5 -5
data/docs/api/extractors/llm.md +17 -17
data/docs/api/extractors/rule-based.md +14 -14
data/docs/api/facts.md +20 -20
data/docs/api/index.md +4 -4
data/docs/api/models/entity.md +21 -21
data/docs/api/models/fact.md +15 -15
data/docs/api/models/index.md +7 -7
data/docs/api/models/{content.md → source.md} +29 -29
data/docs/api/pipeline/extraction.md +25 -25
data/docs/api/pipeline/index.md +1 -1
data/docs/api/pipeline/resolution.md +4 -4
data/docs/api/services/entity-service.md +20 -20
data/docs/api/services/fact-service.md +12 -12
data/docs/api/services/index.md +5 -5
data/docs/api/services/{content-service.md → source-service.md} +27 -27
data/docs/architecture/database-schema.md +46 -46
data/docs/architecture/entity-resolution.md +6 -6
data/docs/architecture/index.md +10 -10
data/docs/architecture/temporal-facts.md +5 -5
data/docs/architecture/three-layer-model.md +17 -17
data/docs/concepts.md +6 -6
data/docs/examples/basic-usage.md +20 -20
data/docs/examples/hr-onboarding.md +17 -17
data/docs/examples/index.md +4 -4
data/docs/examples/news-analysis.md +23 -23
data/docs/getting-started/database-setup.md +28 -20
data/docs/getting-started/index.md +3 -3
data/docs/getting-started/quick-start.md +33 -30
data/docs/guides/batch-processing.md +26 -26
data/docs/guides/configuration.md +158 -77
data/docs/guides/entity-management.md +14 -14
data/docs/guides/extracting-facts.md +28 -28
data/docs/guides/ingesting-content.md +14 -14
data/docs/guides/llm-integration.md +40 -32
data/docs/guides/temporal-queries.md +11 -11
data/docs/index.md +6 -2
data/examples/.envrc +4 -0
data/examples/.gitignore +1 -0
data/examples/001_configuration.rb +312 -0
data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
data/examples/040_output_formats.rb +177 -0
data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
data/examples/060_fluent_temporal_api.rb +217 -0
data/examples/070_introspection.rb +252 -0
data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
data/examples/090_ingest_demo.rb +515 -0
data/examples/100_query_context.rb +668 -0
data/examples/110_prove_it.rb +204 -0
data/examples/120_dump_database.rb +358 -0
data/examples/130_rag_feedback_loop.rb +858 -0
data/examples/README.md +229 -15
data/examples/data/lincoln_associates.md +201 -0
data/examples/data/lincoln_biography.md +66 -0
data/examples/data/lincoln_cabinet.md +243 -0
data/examples/data/lincoln_family.md +163 -0
data/examples/data/lincoln_military.md +241 -0
data/examples/data/lincoln_todd_family.md +136 -0
data/examples/ingest_reporter.rb +335 -0
data/examples/utilities.rb +182 -0
data/lib/fact_db/config/defaults.yml +254 -0
data/lib/fact_db/config.rb +94 -35
data/lib/fact_db/database.rb +98 -8
data/lib/fact_db/extractors/base.rb +106 -21
data/lib/fact_db/extractors/llm_extractor.rb +35 -63
data/lib/fact_db/extractors/manual_extractor.rb +46 -6
data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
data/lib/fact_db/llm/adapter.rb +3 -3
data/lib/fact_db/models/entity.rb +94 -22
data/lib/fact_db/models/entity_alias.rb +41 -7
data/lib/fact_db/models/entity_mention.rb +34 -1
data/lib/fact_db/models/fact.rb +259 -28
data/lib/fact_db/models/fact_source.rb +43 -9
data/lib/fact_db/models/source.rb +113 -0
data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
data/lib/fact_db/query_result.rb +202 -0
data/lib/fact_db/resolution/entity_resolver.rb +139 -39
data/lib/fact_db/resolution/fact_resolver.rb +86 -14
data/lib/fact_db/services/entity_service.rb +246 -37
data/lib/fact_db/services/fact_service.rb +254 -17
data/lib/fact_db/services/source_service.rb +164 -0
data/lib/fact_db/temporal/query.rb +71 -7
data/lib/fact_db/temporal/query_builder.rb +69 -0
data/lib/fact_db/temporal/timeline.rb +102 -11
data/lib/fact_db/transformers/base.rb +77 -0
data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
data/lib/fact_db/transformers/json_transformer.rb +17 -0
data/lib/fact_db/transformers/raw_transformer.rb +35 -0
data/lib/fact_db/transformers/text_transformer.rb +114 -0
data/lib/fact_db/transformers/triple_transformer.rb +138 -0
data/lib/fact_db/validation/alias_filter.rb +185 -0
data/lib/fact_db/version.rb +1 -1
data/lib/fact_db.rb +281 -30
data/mkdocs.yml +2 -2
metadata +60 -16
data/db/migrate/002_create_contents.rb +0 -44
data/lib/fact_db/models/content.rb +0 -62
data/lib/fact_db/services/content_service.rb +0 -93

data/lib/fact_db/extractors/base.rb CHANGED Viewed

@@ -2,34 +2,73 @@
 module FactDb
   module Extractors
+    # Abstract base class for fact extractors
+    #
+    # Provides common interface and helper methods for extracting facts and entities
+    # from text. Subclasses must implement #extract and #extract_entities.
+    #
+    # @abstract Subclass and override {#extract} and {#extract_entities} to implement.
+    #
+    # @example Create a custom extractor
+    #   class MyExtractor < FactDb::Extractors::Base
+    #     def extract(text, context = {})
+    #       # Implementation
+    #     end
+    #
+    #     def extract_entities(text)
+    #       # Implementation
+    #     end
+    #   end
+    #
     class Base
+      # @return [FactDb::Config] the configuration object
       attr_reader :config
+      # Initializes a new extractor
+      #
+      # @param config [FactDb::Config] configuration object (defaults to FactDb.config)
       def initialize(config = FactDb.config)
         @config = config
       end
-      # Extract facts from text
-      # @param text [String] Raw text to extract from
-      # @param context [Hash] Additional context (captured_at, source_uri, etc.)
-      # @return [Array<Hash>] Array of fact data hashes
+      # Extracts facts from text
+      #
+      # @abstract Subclass and override this method
+      # @param text [String] raw text to extract from
+      # @param context [Hash] additional context (captured_at, source_uri, etc.)
+      # @return [Array<Hash>] array of fact data hashes
+      # @raise [NotImplementedError] if not implemented by subclass
       def extract(text, context = {})
         raise NotImplementedError, "#{self.class} must implement #extract"
       end
-      # Extract entities from text
-      # @param text [String] Raw text to extract from
-      # @return [Array<Hash>] Array of { name:, type:, aliases: }
+      # Extracts entities from text
+      #
+      # @abstract Subclass and override this method
+      # @param text [String] raw text to extract from
+      # @return [Array<Hash>] array of entity hashes with :name, :kind, :aliases
+      # @raise [NotImplementedError] if not implemented by subclass
       def extract_entities(text)
         raise NotImplementedError, "#{self.class} must implement #extract_entities"
       end
-      # Get the extraction method name
+      # Returns the extraction method name derived from class name
+      #
+      # @return [String] method name (e.g., "manual", "llm", "rule_based")
       def extraction_method
         self.class.name.split("::").last.sub("Extractor", "").underscore
       end
       class << self
+        # Factory method to create an extractor by type
+        #
+        # @param type [Symbol, String] extractor type (:manual, :llm, :rule_based)
+        # @param config [FactDb::Config] configuration object
+        # @return [Base] an extractor instance
+        # @raise [ArgumentError] if type is unknown
+        #
+        # @example
+        #   extractor = FactDb::Extractors::Base.for(:llm)
         def for(type, config = FactDb.config)
           case type.to_sym
           when :manual
@@ -43,6 +82,9 @@ module FactDb
           end
         end
+        # Returns list of available extractor types
+        #
+        # @return [Array<Symbol>] available extractor type symbols
         def available_types
           %i[manual llm rule_based]
         end
@@ -50,7 +92,12 @@ module FactDb
       protected
-      # Parse a date string, returning nil if invalid
+      # Parses a date string, returning nil if invalid
+      #
+      # Supports natural language parsing via Chronic if available.
+      #
+      # @param date_str [String, nil] date string to parse
+      # @return [Date, nil] parsed date or nil if invalid
       def parse_date(date_str)
         return nil if date_str.nil? || date_str.to_s.empty?
@@ -65,7 +112,12 @@ module FactDb
         nil
       end
-      # Parse a timestamp string, returning nil if invalid
+      # Parses a timestamp string, returning nil if invalid
+      #
+      # Supports natural language parsing via Chronic if available.
+      #
+      # @param timestamp_str [String, nil] timestamp string to parse
+      # @return [Time, nil] parsed time or nil if invalid
       def parse_timestamp(timestamp_str)
         return nil if timestamp_str.nil? || timestamp_str.to_s.empty?
@@ -80,7 +132,15 @@ module FactDb
         nil
       end
-      # Build a standardized fact hash
+      # Builds a standardized fact hash
+      #
+      # @param text [String] the fact text
+      # @param valid_at [Date, Time] when the fact became valid
+      # @param invalid_at [Date, Time, nil] when the fact became invalid
+      # @param mentions [Array<Hash>] entity mentions
+      # @param confidence [Float] confidence score (0.0 to 1.0)
+      # @param metadata [Hash] additional metadata
+      # @return [Hash] standardized fact hash for persistence
       def build_fact(text:, valid_at:, invalid_at: nil, mentions: [], confidence: 1.0, metadata: {})
         {
           text: text.strip,
@@ -93,23 +153,48 @@ module FactDb
         }
       end
-      # Build a standardized entity hash
-      def build_entity(name:, type:, aliases: [], attributes: {})
+      # Builds a standardized entity hash
+      #
+      # Automatically filters aliases through AliasFilter.
+      #
+      # @param name [String] the entity name
+      # @param kind [String, Symbol] entity kind (person, organization, etc.)
+      # @param aliases [Array<String>] alternative names
+      # @param attributes [Hash] additional attributes
+      # @return [Hash] standardized entity hash
+      def build_entity(name:, kind:, aliases: [], attributes: {})
+        canonical_name = name.strip
+        filtered_aliases = Validation::AliasFilter.filter(aliases, name: canonical_name)
         {
-          name: name.strip,
-          type: type.to_s,
-          aliases: aliases.map(&:strip),
+          name: canonical_name,
+          kind: kind.to_s,
+          aliases: filtered_aliases,
           attributes: attributes
         }
       end
-      # Build a standardized mention hash
-      def build_mention(name:, type:, role: nil, confidence: 1.0)
+      # Builds a standardized entity mention hash
+      #
+      # Automatically filters aliases through AliasFilter.
+      #
+      # @param name [String] the entity name
+      # @param kind [String, Symbol] entity kind
+      # @param role [String, Symbol, nil] mention role (subject, object, etc.)
+      # @param confidence [Float] confidence score (0.0 to 1.0)
+      # @param aliases [Array<String>] alternative names
+      # @return [Hash] standardized mention hash
+      def build_mention(name:, kind:, role: nil, confidence: 1.0, aliases: [])
+        canonical_name = name.strip
+        raw_aliases = Array(aliases).map { |a| a.to_s.strip }.reject(&:empty?)
+        filtered_aliases = Validation::AliasFilter.filter(raw_aliases, name: canonical_name)
         {
-          name: name.strip,
-          type: type.to_s,
+          name: canonical_name,
+          kind: kind.to_s,
           role: role&.to_s,
-          confidence: confidence
+          confidence: confidence,
+          aliases: filtered_aliases
         }
       end
     end

data/lib/fact_db/extractors/llm_extractor.rb CHANGED Viewed

@@ -4,84 +4,55 @@ require "json"
 module FactDb
   module Extractors
+    # LLM-based fact extractor using language models
+    #
+    # Uses a configured LLM client to extract atomic facts and entities from
+    # unstructured text. Parses JSON responses from the LLM and builds
+    # standardized fact/entity hashes.
+    #
+    # @example Extract facts using LLM
+    #   FactDb.configure { |c| c.llm_client = MyLLMClient.new }
+    #   extractor = LLMExtractor.new
+    #   facts = extractor.extract("Paula joined Microsoft on January 10, 2024...")
+    #
     class LLMExtractor < Base
-      FACT_EXTRACTION_PROMPT = <<~PROMPT
-        Extract factual assertions from the following text. For each fact:
-        1. State the assertion clearly and concisely
-        2. Identify when it became true (valid_at) if mentioned
-        3. Identify when it stopped being true (invalid_at) if mentioned
-        4. Identify entities mentioned (people, organizations, places, products)
-        5. Assign a confidence score (0.0 to 1.0) based on how explicitly stated the fact is
-        Text:
-        %<text>s
-        Return as a JSON array with this structure:
-        [
-          {
-            "text": "Paula works at Microsoft as Principal Engineer",
-            "valid_at": "2024-01-10",
-            "invalid_at": null,
-            "confidence": 0.95,
-            "mentions": [
-              {"name": "Paula", "type": "person", "role": "subject"},
-              {"name": "Microsoft", "type": "organization", "role": "object"}
-            ]
-          }
-        ]
-        Rules:
-        - Extract only factual assertions, not opinions or speculation
-        - Use ISO 8601 date format (YYYY-MM-DD) when possible
-        - Set invalid_at to null if the fact is still true or unknown
-        - Set valid_at to null if the timing is not mentioned
-        - Entity types: person, organization, place, product, event, concept
-        - Roles: subject, object, location, temporal, instrument, beneficiary
-        Return only valid JSON, no additional text.
-      PROMPT
-      ENTITY_EXTRACTION_PROMPT = <<~PROMPT
-        Extract all named entities from the following text.
-        For each entity:
-        1. Identify the canonical name
-        2. Classify the type (person, organization, place, product, event, concept)
-        3. List any aliases or alternative names mentioned
-        Text:
-        %<text>s
-        Return as a JSON array:
-        [
-          {
-            "name": "Paula Chen",
-            "type": "person",
-            "aliases": ["Paula", "P. Chen"]
-          }
-        ]
-        Return only valid JSON, no additional text.
-      PROMPT
+      # Extracts atomic facts from text using the configured LLM
+      #
+      # Prompts the LLM to identify factual assertions, temporal information,
+      # entity mentions with roles, and confidence scores.
+      #
+      # @param text [String] raw text to extract from
+      # @param context [Hash] additional context
+      # @option context [Date, Time] :captured_at default timestamp for facts
+      # @return [Array<Hash>] array of fact hashes
+      # @raise [ConfigurationError] if no LLM client is configured
       def extract(text, context = {})
         return [] if text.nil? || text.strip.empty?
         client = config.llm_client
         raise ConfigurationError, "LLM client not configured" unless client
-        prompt = format(FACT_EXTRACTION_PROMPT, text: text)
+        prompt = format(config.prompts.fact_extraction, text: text)
         response = call_llm(client, prompt)
         parse_fact_response(response, context)
       end
+      # Extracts entities from text using the configured LLM
+      #
+      # Prompts the LLM to identify named entities, classify their types,
+      # and list any aliases or alternative names.
+      #
+      # @param text [String] raw text to extract from
+      # @return [Array<Hash>] array of entity hashes with :name, :kind, :aliases
+      # @raise [ConfigurationError] if no LLM client is configured
       def extract_entities(text)
         return [] if text.nil? || text.strip.empty?
         client = config.llm_client
         raise ConfigurationError, "LLM client not configured" unless client
-        prompt = format(ENTITY_EXTRACTION_PROMPT, text: text)
+        prompt = format(config.prompts.entity_extraction, text: text)
         response = call_llm(client, prompt)
         parse_entity_response(response)
@@ -135,7 +106,7 @@ module FactDb
         parsed.map do |entity_data|
           build_entity(
             name: entity_data["name"],
-            type: entity_data["type"] || "concept",
+            kind: entity_data["type"] || "concept",
             aliases: entity_data["aliases"] || [],
             attributes: entity_data["attributes"] || {}
           )
@@ -151,9 +122,10 @@ module FactDb
         mentions_data.map do |mention|
           build_mention(
             name: mention["name"],
-            type: mention["type"] || "concept",
+            kind: mention["type"] || "concept",
             role: mention["role"],
-            confidence: mention["confidence"]&.to_f || 1.0
+            confidence: mention["confidence"]&.to_f || 1.0,
+            aliases: mention["aliases"] || []
           )
         end
       end

data/lib/fact_db/extractors/manual_extractor.rb CHANGED Viewed

@@ -2,10 +2,29 @@
 module FactDb
   module Extractors
+    # Manual fact extractor for API-driven fact creation
+    #
+    # Passes through user-provided text as a single fact without any
+    # automated extraction. Used when the user provides fact text and
+    # metadata directly via the API.
+    #
+    # @example Extract a manual fact
+    #   extractor = ManualExtractor.new
+    #   facts = extractor.extract("John works at Acme", valid_at: Date.today)
+    #
     class ManualExtractor < Base
-      # Manual extraction passes through the text as a single fact
-      # This is used for API-driven fact creation where the user
-      # provides the fact text and metadata directly
+      # Extracts a single fact from the provided text
+      #
+      # Returns the text as-is without parsing. All metadata comes from context.
+      #
+      # @param text [String] the fact text
+      # @param context [Hash] fact metadata
+      # @option context [Date, Time] :valid_at when the fact became valid
+      # @option context [Date, Time] :invalid_at when the fact became invalid
+      # @option context [Array<Hash>] :mentions entity mentions
+      # @option context [Float] :confidence confidence score
+      # @option context [Hash] :metadata additional metadata
+      # @return [Array<Hash>] array with single fact hash, or empty if text is blank
       def extract(text, context = {})
         return [] if text.nil? || text.strip.empty?
@@ -23,12 +42,25 @@ module FactDb
         ]
       end
-      # Manual extraction expects entities to be provided explicitly
+      # Returns empty array since manual extraction expects entities to be provided
+      #
+      # @param text [String] ignored
+      # @return [Array] empty array
       def extract_entities(text)
         []
       end
-      # Convenience method for creating a single fact with full control
+      # Creates a single fact with full control over all attributes
+      #
+      # Convenience method that wraps #extract with named parameters.
+      #
+      # @param text [String] the fact text
+      # @param valid_at [Date, Time] when the fact became valid
+      # @param invalid_at [Date, Time, nil] when the fact became invalid
+      # @param mentions [Array<Hash>] entity mentions
+      # @param confidence [Float] confidence score (0.0 to 1.0)
+      # @param metadata [Hash] additional metadata
+      # @return [Hash] the fact hash
       def create_fact(text:, valid_at:, invalid_at: nil, mentions: [], confidence: 1.0, metadata: {})
         extract(text, {
           valid_at: valid_at,
@@ -39,7 +71,15 @@ module FactDb
         }).first
       end
-      # Convenience method for creating an entity
+      # Creates an entity hash
+      #
+      # Convenience method for building entity data manually.
+      #
+      # @param name [String] the entity name
+      # @param type [String, Symbol] entity kind (person, organization, etc.)
+      # @param aliases [Array<String>] alternative names
+      # @param attributes [Hash] additional attributes
+      # @return [Hash] the entity hash
       def create_entity(name:, type:, aliases: [], attributes: {})
         build_entity(
           name: name,

data/lib/fact_db/extractors/rule_based_extractor.rb CHANGED Viewed

@@ -2,8 +2,18 @@
 module FactDb
   module Extractors
+    # Rule-based fact extractor using regex patterns
+    #
+    # Extracts facts from text using predefined regex patterns for common
+    # fact types like employment, relationships, and locations. Does not
+    # require an LLM but is limited to recognized patterns.
+    #
+    # @example Extract facts using patterns
+    #   extractor = RuleBasedExtractor.new
+    #   facts = extractor.extract("Paula works at Microsoft in Seattle")
+    #
     class RuleBasedExtractor < Base
-      # Date patterns for temporal extraction
+      # @return [Array<Regexp>] patterns for extracting start dates
       DATE_PATTERNS = [
         # "on January 10, 2024"
         /(?:on|since|from|as of|starting)\s+(\w+\s+\d{1,2},?\s+\d{4})/i,
@@ -15,40 +25,50 @@ module FactDb
         /(?:in|during)\s+(\d{4})\b/i
       ].freeze
+      # @return [Array<Regexp>] patterns for extracting end dates
       END_DATE_PATTERNS = [
         # "until January 10, 2024"
         /(?:until|through|to|ended|left)\s+(\w+\s+\d{1,2},?\s+\d{4})/i,
         /(?:until|through|to|ended|left)\s+(\d{4}-\d{2}-\d{2})/i
       ].freeze
-      # Employment patterns
+      # @return [Array<Regexp>] patterns for employment facts
       EMPLOYMENT_PATTERNS = [
         # "Paula works at Microsoft"
-        /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:works?|worked|is working)\s+(?:at|for)\s+(\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)/,
+        /(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:works?|worked|is working)[ ]+(?:at|for)[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)\b/,
         # "Paula joined Microsoft"
-        /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:joined|started at|was hired by)\s+(\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)/,
+        /(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:joined|started at|was hired by)[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)\b/,
         # "Paula left Microsoft"
-        /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:left|departed|resigned from|was fired from)\s+(\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)/,
+        /(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:left|departed|resigned from|was fired from)[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)\b/,
         # "Paula is a Principal Engineer at Microsoft"
-        /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:is|was|became)\s+(?:a\s+)?([A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)\s+at\s+(\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)/
+        /(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:is|was|became)[ ]+(?:a[ ]+)?([A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)[ ]+at[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)\b/
       ].freeze
-      # Relationship patterns
+      # @return [Array<Regexp>] patterns for relationship facts
       RELATIONSHIP_PATTERNS = [
         # "Paula is married to John"
-        /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:is|was)\s+(?:married to|engaged to|dating)\s+(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)/,
+        /(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:is|was)[ ]+(?:married to|engaged to|dating)[ ]+(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b/,
         # "Paula is the CEO of Microsoft"
-        /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:is|was)\s+(?:the\s+)?(\w+(?:\s+\w+)*)\s+of\s+(\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)/
+        /(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:is|was)[ ]+(?:the[ ]+)?(\w+(?:[ ]+\w+)*)[ ]+of[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)\b/
       ].freeze
-      # Location patterns
+      # @return [Array<Regexp>] patterns for location facts
       LOCATION_PATTERNS = [
-        # "Paula lives in Seattle"
-        /(\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:lives?|lived|is based|was based|relocated)\s+(?:in|to)\s+(\b[A-Z][A-Za-z]+(?:,?\s+[A-Z]{2})?)/,
-        # "Microsoft is headquartered in Redmond"
-        /(\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*)\s+(?:is|was)\s+(?:headquartered|located|based)\s+in\s+(\b[A-Z][A-Za-z]+(?:,?\s+[A-Z]{2})?)/
+        # "Paula lives in Seattle" or "Bob lives in New York City"
+        /(\b[A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)*)\b[ ]+(?:lives?|lived|is based|was based|relocated|moved)[ ]+(?:in|to)[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*(?:,[ ]+[A-Z]{2})?)\b/,
+        # "Microsoft is headquartered in Redmond" or "in Seattle, Washington"
+        /(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*)\b[ ]+(?:is|was)[ ]+(?:headquartered|located|based)[ ]+in[ ]+(\b[A-Z][A-Za-z]+(?:[ ]+[A-Z][A-Za-z]+)*(?:,[ ]+[A-Z][A-Za-z]+)?)\b/
       ].freeze
+      # Extracts facts from text using regex patterns
+      #
+      # Applies employment, relationship, and location patterns to identify
+      # facts, with associated entity mentions and temporal information.
+      #
+      # @param text [String] raw text to extract from
+      # @param context [Hash] additional context
+      # @option context [Date, Time] :captured_at default timestamp for facts
+      # @return [Array<Hash>] array of fact hashes, deduplicated by text
       def extract(text, context = {})
         return [] if text.nil? || text.strip.empty?
@@ -66,23 +86,35 @@ module FactDb
         facts.uniq { |f| f[:text] }
       end
+      # Extracts entities from text using regex patterns
+      #
+      # Identifies person names, organization names, and locations using
+      # pattern matching. Filters out common words, job titles, and known phrases.
+      #
+      # @param text [String] raw text to extract from
+      # @return [Array<Hash>] array of entity hashes with :name and :kind
       def extract_entities(text)
         return [] if text.nil? || text.strip.empty?
         entities = []
-        # Extract person names (simple capitalized word sequences)
-        text.scan(/\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b/).flatten.uniq.each do |name|
+        # Extract person names (capitalized word sequences on same line)
+        # Use [ ]+ instead of \s+ to avoid matching across newlines
+        text.scan(/\b([A-Z][a-z]+(?:[ ]+[A-Z][a-z]+)+)\b/).flatten.uniq.each do |name|
           next if common_word?(name)
+          next if job_title?(name)
+          next if common_phrase?(name)
+          next if known_place?(name)
+          next if organization_indicator?(name)
-          entities << build_entity(name: name, type: "person")
+          entities << build_entity(name: name, kind: "person")
         end
         # Extract organization names (from employment patterns)
         EMPLOYMENT_PATTERNS.each do |pattern|
           text.scan(pattern).each do |match|
             org_name = match.last
-            entities << build_entity(name: org_name, type: "organization") unless common_word?(org_name)
+            entities << build_entity(name: org_name, kind: "organization") unless common_word?(org_name)
           end
         end
@@ -90,7 +122,7 @@ module FactDb
         LOCATION_PATTERNS.each do |pattern|
           text.scan(pattern).each do |match|
             location = match.last
-            entities << build_entity(name: location, type: "place") unless common_word?(location)
+            entities << build_entity(name: location, kind: "place") unless common_word?(location)
           end
         end
@@ -116,13 +148,13 @@ module FactDb
             invalid_at = is_termination ? (extract_end_date(text) || default_date) : nil
             mentions = [
-              build_mention(name: person, type: "person", role: "subject"),
-              build_mention(name: org, type: "organization", role: "object")
+              build_mention(name: person, kind: "person", role: "subject"),
+              build_mention(name: org, kind: "organization", role: "object")
             ]
             # Add role if present
             if rest.length > 1
-              mentions << build_mention(name: rest[0], type: "concept", role: "instrument")
+              mentions << build_mention(name: rest[0], kind: "concept", role: "instrument")
             end
             facts << build_fact(
@@ -148,7 +180,7 @@ module FactDb
             mentions = match.map.with_index do |name, i|
               role = i.zero? ? "subject" : "object"
-              build_mention(name: name, type: "person", role: role)
+              build_mention(name: name, kind: "person", role: role)
             end
             facts << build_fact(
@@ -177,8 +209,8 @@ module FactDb
             entity_type = text.match?(/#{Regexp.escape(entity_name)}\s+(?:lives?|lived)/i) ? "person" : "organization"
             mentions = [
-              build_mention(name: entity_name, type: entity_type, role: "subject"),
-              build_mention(name: location, type: "place", role: "location")
+              build_mention(name: entity_name, kind: entity_type, role: "subject"),
+              build_mention(name: location, kind: "place", role: "location")
             ]
             facts << build_fact(
@@ -223,6 +255,85 @@ module FactDb
         ]
         common_words.any? { |w| w.casecmp?(word) }
       end
+      def job_title?(text)
+        # Common job title words that indicate this is a role, not a person name
+        title_indicators = %w[
+          Chief Executive Officer Director Manager Engineer Developer
+          President Vice Principal Senior Junior Lead Head
+          Analyst Coordinator Administrator Assistant Specialist
+          Consultant Architect Designer Technician Supervisor
+          CTO CEO CFO COO CMO CIO CPO
+          VP SVP EVP
+        ]
+        words = text.split(/\s+/)
+        # If any word is a title indicator, it's likely a job title
+        words.any? { |word| title_indicators.any? { |t| t.casecmp?(word) } }
+      end
+      def common_phrase?(text)
+        # Common document phrases that are not person names
+        phrases = [
+          /Team\s+Updates?/i,
+          /Action\s+Items?/i,
+          /Meeting\s+Notes?/i,
+          /Status\s+Meeting/i,
+          /Project\s+Status/i,
+          /Human\s+Resources?/i,
+          /Best\s+Regards?/i,
+          /Immediate\s+Release/i,
+          /New\s+Leadership/i,
+          /Appoints?\s+New/i,
+          /Recent\s+\w+/i,
+          /Please\s+\w+/i
+        ]
+        phrases.any? { |pattern| text.match?(pattern) }
+      end
+      def known_place?(text)
+        # Common city/place names or location indicators
+        place_indicators = %w[
+          City County State Province District Region
+          Beach Park Heights Hills Valley Springs Lake
+          Island Harbor Port
+        ]
+        # Common multi-word US city names
+        known_cities = [
+          "New York", "Los Angeles", "San Francisco", "San Diego", "San Jose",
+          "San Antonio", "Las Vegas", "Salt Lake", "New Orleans", "Fort Worth",
+          "Fort Lauderdale", "St Louis", "St Paul", "El Paso", "Santa Fe",
+          "Santa Monica", "Palm Beach", "Long Beach", "Virginia Beach"
+        ]
+        words = text.split(/\s+/)
+        # Check for place indicator words
+        return true if words.any? { |word| place_indicators.any? { |p| p.casecmp?(word) } }
+        # Check for known city names
+        known_cities.any? { |city| text.casecmp?(city) || text.start_with?("#{city} ") }
+      end
+      def organization_indicator?(text)
+        # Words that indicate an organization, not a person
+        org_indicators = %w[
+          Solutions Technologies Systems Services Group
+          Partners Associates Consulting Agency
+          Industries Enterprises Holdings Ventures
+          Foundation Institute University College
+          Global International National Regional
+          Tech Corp Labs
+        ]
+        words = text.split(/\s+/)
+        # If any word is an org indicator, it's likely an organization
+        words.any? { |word| org_indicators.any? { |o| o.casecmp?(word) } }
+      end
     end
   end
 end

data/lib/fact_db/llm/adapter.rb CHANGED Viewed

@@ -29,9 +29,9 @@ module FactDb
     #   # llm_api_key: <%= ENV["ANTHROPIC_API_KEY"] %>
     #
     # @example Configure via environment variables
-    #   # EVENT_CLOCK_LLM_PROVIDER=anthropic
-    #   # EVENT_CLOCK_LLM_MODEL=claude-sonnet-4-20250514
-    #   # EVENT_CLOCK_LLM_API_KEY=sk-...
+    #   # FACT_DB_LLM_PROVIDER=anthropic
+    #   # FACT_DB_LLM_MODEL=claude-sonnet-4-20250514
+    #   # FACT_DB_LLM_API_KEY=sk-...
     #
     class Adapter
       attr_reader :model, :provider