fact_db 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/.envrc +1 -0
  3. data/CHANGELOG.md +48 -0
  4. data/COMMITS.md +196 -0
  5. data/README.md +102 -0
  6. data/Rakefile +41 -0
  7. data/db/migrate/001_enable_extensions.rb +7 -0
  8. data/db/migrate/002_create_contents.rb +44 -0
  9. data/db/migrate/003_create_entities.rb +36 -0
  10. data/db/migrate/004_create_entity_aliases.rb +18 -0
  11. data/db/migrate/005_create_facts.rb +65 -0
  12. data/db/migrate/006_create_entity_mentions.rb +18 -0
  13. data/db/migrate/007_create_fact_sources.rb +18 -0
  14. data/docs/api/extractors/index.md +71 -0
  15. data/docs/api/extractors/llm.md +162 -0
  16. data/docs/api/extractors/manual.md +92 -0
  17. data/docs/api/extractors/rule-based.md +165 -0
  18. data/docs/api/facts.md +300 -0
  19. data/docs/api/index.md +66 -0
  20. data/docs/api/models/content.md +165 -0
  21. data/docs/api/models/entity.md +202 -0
  22. data/docs/api/models/fact.md +270 -0
  23. data/docs/api/models/index.md +77 -0
  24. data/docs/api/pipeline/extraction.md +175 -0
  25. data/docs/api/pipeline/index.md +72 -0
  26. data/docs/api/pipeline/resolution.md +209 -0
  27. data/docs/api/services/content-service.md +166 -0
  28. data/docs/api/services/entity-service.md +202 -0
  29. data/docs/api/services/fact-service.md +223 -0
  30. data/docs/api/services/index.md +55 -0
  31. data/docs/architecture/database-schema.md +293 -0
  32. data/docs/architecture/entity-resolution.md +293 -0
  33. data/docs/architecture/index.md +149 -0
  34. data/docs/architecture/temporal-facts.md +268 -0
  35. data/docs/architecture/three-layer-model.md +242 -0
  36. data/docs/assets/css/custom.css +137 -0
  37. data/docs/assets/fact_db.jpg +0 -0
  38. data/docs/assets/images/fact_db.jpg +0 -0
  39. data/docs/concepts.md +183 -0
  40. data/docs/examples/basic-usage.md +235 -0
  41. data/docs/examples/hr-onboarding.md +312 -0
  42. data/docs/examples/index.md +64 -0
  43. data/docs/examples/news-analysis.md +288 -0
  44. data/docs/getting-started/database-setup.md +170 -0
  45. data/docs/getting-started/index.md +71 -0
  46. data/docs/getting-started/installation.md +98 -0
  47. data/docs/getting-started/quick-start.md +191 -0
  48. data/docs/guides/batch-processing.md +325 -0
  49. data/docs/guides/configuration.md +243 -0
  50. data/docs/guides/entity-management.md +364 -0
  51. data/docs/guides/extracting-facts.md +299 -0
  52. data/docs/guides/index.md +22 -0
  53. data/docs/guides/ingesting-content.md +252 -0
  54. data/docs/guides/llm-integration.md +299 -0
  55. data/docs/guides/temporal-queries.md +315 -0
  56. data/docs/index.md +121 -0
  57. data/examples/README.md +130 -0
  58. data/examples/basic_usage.rb +164 -0
  59. data/examples/entity_management.rb +216 -0
  60. data/examples/hr_system.rb +428 -0
  61. data/examples/rule_based_extraction.rb +258 -0
  62. data/examples/temporal_queries.rb +245 -0
  63. data/lib/fact_db/config.rb +71 -0
  64. data/lib/fact_db/database.rb +45 -0
  65. data/lib/fact_db/errors.rb +10 -0
  66. data/lib/fact_db/extractors/base.rb +117 -0
  67. data/lib/fact_db/extractors/llm_extractor.rb +179 -0
  68. data/lib/fact_db/extractors/manual_extractor.rb +53 -0
  69. data/lib/fact_db/extractors/rule_based_extractor.rb +228 -0
  70. data/lib/fact_db/llm/adapter.rb +109 -0
  71. data/lib/fact_db/models/content.rb +62 -0
  72. data/lib/fact_db/models/entity.rb +84 -0
  73. data/lib/fact_db/models/entity_alias.rb +26 -0
  74. data/lib/fact_db/models/entity_mention.rb +33 -0
  75. data/lib/fact_db/models/fact.rb +192 -0
  76. data/lib/fact_db/models/fact_source.rb +35 -0
  77. data/lib/fact_db/pipeline/extraction_pipeline.rb +146 -0
  78. data/lib/fact_db/pipeline/resolution_pipeline.rb +129 -0
  79. data/lib/fact_db/resolution/entity_resolver.rb +261 -0
  80. data/lib/fact_db/resolution/fact_resolver.rb +259 -0
  81. data/lib/fact_db/services/content_service.rb +93 -0
  82. data/lib/fact_db/services/entity_service.rb +150 -0
  83. data/lib/fact_db/services/fact_service.rb +193 -0
  84. data/lib/fact_db/temporal/query.rb +125 -0
  85. data/lib/fact_db/temporal/timeline.rb +134 -0
  86. data/lib/fact_db/version.rb +5 -0
  87. data/lib/fact_db.rb +141 -0
  88. data/mkdocs.yml +198 -0
  89. metadata +288 -0
@@ -0,0 +1,170 @@
1
+ # Database Setup
2
+
3
+ FactDb uses PostgreSQL with the pgvector extension for storing content, entities, and facts with semantic search capabilities.
4
+
5
+ ## Create Database
6
+
7
+ ```bash
8
+ createdb fact_db
9
+ ```
10
+
11
+ ## Enable pgvector
12
+
13
+ Connect to your database and enable the extension:
14
+
15
+ ```sql
16
+ CREATE EXTENSION IF NOT EXISTS vector;
17
+ ```
18
+
19
+ ## Run Migrations
20
+
21
+ FactDb provides migrations that create all necessary tables:
22
+
23
+ ```ruby
24
+ require 'fact_db'
25
+
26
+ FactDb.configure do |config|
27
+ config.database_url = "postgresql://localhost/fact_db"
28
+ end
29
+
30
+ FactDb::Database.migrate!
31
+ ```
32
+
33
+ ## Schema Overview
34
+
35
+ The migrations create six tables:
36
+
37
+ ### contents
38
+
39
+ Stores immutable source documents.
40
+
41
+ | Column | Type | Description |
42
+ |--------|------|-------------|
43
+ | id | bigint | Primary key |
44
+ | content_hash | string | SHA256 hash for deduplication |
45
+ | content_type | string | Type (email, document, article) |
46
+ | raw_text | text | Original content |
47
+ | title | string | Optional title |
48
+ | source_uri | string | Original location |
49
+ | source_metadata | jsonb | Additional metadata |
50
+ | embedding | vector(1536) | Semantic search vector |
51
+ | captured_at | timestamptz | When content was captured |
52
+
53
+ ### entities
54
+
55
+ Stores resolved identities.
56
+
57
+ | Column | Type | Description |
58
+ |--------|------|-------------|
59
+ | id | bigint | Primary key |
60
+ | canonical_name | string | Authoritative name |
61
+ | entity_type | string | person, organization, place, etc. |
62
+ | resolution_status | string | unresolved, resolved, merged |
63
+ | merged_into_id | bigint | Points to canonical entity if merged |
64
+ | metadata | jsonb | Additional attributes |
65
+ | embedding | vector(1536) | Semantic search vector |
66
+
67
+ ### entity_aliases
68
+
69
+ Stores alternative names for entities.
70
+
71
+ | Column | Type | Description |
72
+ |--------|------|-------------|
73
+ | id | bigint | Primary key |
74
+ | entity_id | bigint | Foreign key to entities |
75
+ | alias_text | string | Alternative name |
76
+ | alias_type | string | nickname, abbreviation, etc. |
77
+ | confidence | float | Match confidence (0-1) |
78
+
79
+ ### facts
80
+
81
+ Stores temporal assertions.
82
+
83
+ | Column | Type | Description |
84
+ |--------|------|-------------|
85
+ | id | bigint | Primary key |
86
+ | fact_text | text | The assertion |
87
+ | fact_hash | string | For deduplication |
88
+ | valid_at | timestamptz | When fact became true |
89
+ | invalid_at | timestamptz | When fact stopped being true |
90
+ | status | string | canonical, superseded, corroborated, synthesized |
91
+ | superseded_by_id | bigint | Points to replacing fact |
92
+ | derived_from_ids | bigint[] | Source facts for synthesized |
93
+ | corroborated_by_ids | bigint[] | Corroborating facts |
94
+ | confidence | float | Extraction confidence |
95
+ | extraction_method | string | manual, llm, rule_based |
96
+ | metadata | jsonb | Additional data |
97
+ | embedding | vector(1536) | Semantic search vector |
98
+
99
+ ### entity_mentions
100
+
101
+ Links facts to entities.
102
+
103
+ | Column | Type | Description |
104
+ |--------|------|-------------|
105
+ | id | bigint | Primary key |
106
+ | fact_id | bigint | Foreign key to facts |
107
+ | entity_id | bigint | Foreign key to entities |
108
+ | mention_text | string | Text that mentioned entity |
109
+ | mention_role | string | subject, object, location, etc. |
110
+ | confidence | float | Resolution confidence |
111
+
112
+ ### fact_sources
113
+
114
+ Links facts to source content.
115
+
116
+ | Column | Type | Description |
117
+ |--------|------|-------------|
118
+ | id | bigint | Primary key |
119
+ | fact_id | bigint | Foreign key to facts |
120
+ | content_id | bigint | Foreign key to contents |
121
+ | source_type | string | primary, supporting, contradicting |
122
+ | excerpt | text | Relevant text excerpt |
123
+ | confidence | float | Source confidence |
124
+
125
+ ## Indexes
126
+
127
+ The migrations create indexes for:
128
+
129
+ - Content hash (unique)
130
+ - Content type
131
+ - Full-text search on raw_text
132
+ - Entity canonical name
133
+ - Entity type
134
+ - Fact status
135
+ - Temporal range queries (valid_at, invalid_at)
136
+ - HNSW indexes for vector similarity search
137
+
138
+ ## Custom Migration
139
+
140
+ If you need to integrate with an existing database or customize the schema, you can either copy the migration files into your own project or run them from a custom path:
141
+
142
+ ```ruby
143
+ # Copy migration files to your project
144
+ FileUtils.cp_r(
145
+ FactDb.root.join('db/migrate'),
146
+ Rails.root.join('db/migrate')
147
+ )
148
+
149
+ # Or run standalone
150
+ FactDb::Database.migrate!(
151
+ migrations_path: '/custom/path/to/migrations'
152
+ )
153
+ ```
154
+
155
+ ## Connection Pool
156
+
157
+ Configure the connection pool for your workload:
158
+
159
+ ```ruby
160
+ FactDb.configure do |config|
161
+ config.database_url = ENV['DATABASE_URL']
162
+ config.database_pool_size = 10 # Default: 5
163
+ config.database_timeout = 60_000 # Default: 30000ms
164
+ end
165
+ ```
166
+
167
+ ## Next Steps
168
+
169
+ - [Quick Start](quick-start.md) - Start using FactDb
170
+ - [Configuration](../guides/configuration.md) - Full configuration options
@@ -0,0 +1,71 @@
1
+ # Getting Started
2
+
3
+ This section will help you get FactDb up and running in your Ruby application.
4
+
5
+ ## Prerequisites
6
+
7
+ Before installing FactDb, ensure you have:
8
+
9
+ - **Ruby 3.0+** - FactDb requires Ruby 3.0 or later
10
+ - **PostgreSQL 14+** - With the pgvector extension installed
11
+ - **Bundler** - For dependency management
12
+
13
+ ## Quick Navigation
14
+
15
+ <div class="grid cards" markdown>
16
+
17
+ - :material-download:{ .lg .middle } **Installation**
18
+
19
+ ---
20
+
21
+ Install FactDb and its dependencies
22
+
23
+ [:octicons-arrow-right-24: Installation Guide](installation.md)
24
+
25
+ - :material-rocket-launch:{ .lg .middle } **Quick Start**
26
+
27
+ ---
28
+
29
+ Get up and running in 5 minutes
30
+
31
+ [:octicons-arrow-right-24: Quick Start](quick-start.md)
32
+
33
+ - :material-database:{ .lg .middle } **Database Setup**
34
+
35
+ ---
36
+
37
+ Configure PostgreSQL and run migrations
38
+
39
+ [:octicons-arrow-right-24: Database Setup](database-setup.md)
40
+
41
+ </div>
42
+
43
+ ## Overview
44
+
45
+ Getting started with FactDb involves three steps:
46
+
47
+ 1. **Install the gem** - Add FactDb to your Gemfile
48
+ 2. **Set up the database** - Enable pgvector and run the migrations to create the tables
49
+ 3. **Configure** - Set database URL and optional LLM settings
50
+
51
+ Once configured, you can start ingesting content and extracting facts:
52
+
53
+ ```ruby
54
+ require 'fact_db'
55
+
56
+ # Configure
57
+ FactDb.configure do |config|
58
+ config.database_url = ENV['DATABASE_URL']
59
+ end
60
+
61
+ # Create a facts instance
62
+ facts = FactDb.new
63
+
64
+ # Ingest content
65
+ content = facts.ingest("Important information...", type: :document)
66
+
67
+ # Extract facts from the ingested content
68
+ extracted = facts.extract_facts(content.id)
69
+ ```
70
+
71
+ Continue to the [Installation Guide](installation.md) to begin.
@@ -0,0 +1,98 @@
1
+ # Installation
2
+
3
+ ## Requirements
4
+
5
+ - Ruby >= 3.0.0
6
+ - PostgreSQL >= 14 with pgvector extension
7
+ - Bundler
8
+
9
+ ## Install the Gem
10
+
11
+ Add FactDb to your Gemfile:
12
+
13
+ ```ruby
14
+ gem 'fact_db'
15
+ ```
16
+
17
+ Then install:
18
+
19
+ ```bash
20
+ bundle install
21
+ ```
22
+
23
+ Or install directly:
24
+
25
+ ```bash
26
+ gem install fact_db
27
+ ```
28
+
29
+ ## Install pgvector
30
+
31
+ FactDb uses pgvector for semantic search. Install the PostgreSQL extension:
32
+
33
+ === "macOS (Homebrew)"
34
+
35
+ ```bash
36
+ brew install pgvector
37
+ ```
38
+
39
+ === "Ubuntu/Debian"
40
+
41
+ ```bash
42
+ sudo apt install postgresql-14-pgvector
43
+ ```
44
+
45
+ === "From Source"
46
+
47
+ ```bash
48
+ git clone https://github.com/pgvector/pgvector.git
49
+ cd pgvector
50
+ make
51
+ sudo make install
52
+ ```
53
+
54
+ Then enable the extension in your database:
55
+
56
+ ```sql
57
+ CREATE EXTENSION IF NOT EXISTS vector;
58
+ ```
59
+
60
+ ## Optional Dependencies
61
+
62
+ ### LLM Extraction
63
+
64
+ For LLM-powered fact extraction, add the ruby_llm gem:
65
+
66
+ ```ruby
67
+ gem 'ruby_llm'
68
+ ```
69
+
70
+ ### Async Processing
71
+
72
+ For parallel pipeline processing with async fibers, add the async gem:
73
+
74
+ ```ruby
75
+ gem 'async', '~> 2.0'
76
+ ```
77
+
78
+ ## Verify Installation
79
+
80
+ Create a simple test script and save it as `test_install.rb`:
81
+
82
+ ```ruby
83
+ require 'fact_db'
84
+
85
+ puts "FactDb version: #{FactDb::VERSION}"
86
+ puts "Installation successful!"
87
+ ```
88
+
89
+ Run it:
90
+
91
+ ```bash
92
+ ruby test_install.rb
93
+ ```
94
+
95
+ ## Next Steps
96
+
97
+ - [Database Setup](database-setup.md) - Configure your database
98
+ - [Quick Start](quick-start.md) - Start using FactDb
@@ -0,0 +1,191 @@
1
+ # Quick Start
2
+
3
+ Get FactDb running in 5 minutes.
4
+
5
+ ## 1. Configure
6
+
7
+ Configure FactDb using environment variables, a YAML configuration file, or a Ruby block:
8
+
9
+ === "Environment Variables"
10
+
11
+ ```bash
12
+ export EVENT_CLOCK_DATABASE_URL="postgresql://localhost/fact_db"
13
+ export EVENT_CLOCK_LLM_PROVIDER="openai"
14
+ export EVENT_CLOCK_LLM_API_KEY="sk-..."
15
+ ```
16
+
17
+ === "YAML Config"
18
+
19
+ ```yaml
20
+ # config/fact_db.yml
21
+ database_url: postgresql://localhost/fact_db
22
+ llm_provider: openai
23
+ llm_api_key: <%= ENV['OPENAI_API_KEY'] %>
24
+ ```
25
+
26
+ === "Ruby Block"
27
+
28
+ ```ruby
29
+ FactDb.configure do |config|
30
+ config.database_url = "postgresql://localhost/fact_db"
31
+ config.llm_provider = :openai
32
+ config.llm_api_key = ENV['OPENAI_API_KEY']
33
+ end
34
+ ```
35
+
36
+ ## 2. Set Up Database
37
+
38
+ Run the migrations:
39
+
40
+ ```ruby
41
+ require 'fact_db'
42
+
43
+ FactDb.configure do |config|
44
+ config.database_url = ENV['DATABASE_URL']
45
+ end
46
+
47
+ # Run migrations
48
+ FactDb::Database.migrate!
49
+ ```
50
+
51
+ ## 3. Create Your First Facts Instance
52
+
53
+ ```ruby
54
+ require 'fact_db'
55
+
56
+ facts = FactDb.new
57
+ ```
58
+
59
+ ## 4. Ingest Content
60
+
61
+ ```ruby
62
+ # Ingest an email
63
+ content = facts.ingest(
64
+ "Hi team, Paula Chen has accepted our offer and will join as Principal Engineer starting January 10, 2024. She'll be reporting to Sarah in the Platform team.",
65
+ type: :email,
66
+ title: "New Hire Announcement",
67
+ captured_at: Time.current
68
+ )
69
+
70
+ puts "Ingested content: #{content.id}"
71
+ ```
72
+
73
+ ## 5. Create Entities
74
+
75
+ ```ruby
76
+ # Create entities for people and organizations
77
+ paula = facts.entity_service.create(
78
+ "Paula Chen",
79
+ type: :person,
80
+ aliases: ["Paula", "P. Chen"]
81
+ )
82
+
83
+ sarah = facts.entity_service.create(
84
+ "Sarah Johnson",
85
+ type: :person,
86
+ aliases: ["Sarah"]
87
+ )
88
+
89
+ platform_team = facts.entity_service.create(
90
+ "Platform Team",
91
+ type: :organization
92
+ )
93
+ ```
94
+
95
+ ## 6. Extract Facts
96
+
97
+ ### Manual Extraction
98
+
99
+ ```ruby
100
+ fact = facts.fact_service.create(
101
+ "Paula Chen joined as Principal Engineer",
102
+ valid_at: Date.parse("2024-01-10"),
103
+ mentions: [
104
+ { entity: paula, role: "subject", text: "Paula Chen" }
105
+ ],
106
+ sources: [
107
+ { content: content, type: "primary" }
108
+ ]
109
+ )
110
+ ```
111
+
112
+ ### LLM Extraction
113
+
114
+ ```ruby
115
+ # Extract facts automatically using LLM
116
+ extracted = facts.extract_facts(content.id, extractor: :llm)
117
+
118
+ extracted.each do |fact|
119
+ puts "Extracted: #{fact.fact_text}"
120
+ puts " Valid from: #{fact.valid_at}"
121
+ end
122
+ ```
123
+
124
+ ## 7. Query Facts
125
+
126
+ ```ruby
127
+ # Get current facts about Paula
128
+ current = facts.current_facts_for(paula.id)
129
+ current.each { |f| puts f.fact_text }
130
+
131
+ # Get facts valid at a specific date
132
+ historical = facts.facts_at(
133
+ Date.parse("2023-12-01"),
134
+ entity: paula.id
135
+ )
136
+
137
+ # Search by topic
138
+ team_facts = facts.query_facts(topic: "Platform Team")
139
+ ```
140
+
141
+ ## 8. Build Timelines
142
+
143
+ ```ruby
144
+ timeline = facts.timeline_for(paula.id)
145
+
146
+ timeline.each do |entry|
147
+ puts "#{entry[:date]}: #{entry[:fact].fact_text}"
148
+ end
149
+ ```
150
+
151
+ ## Complete Example
152
+
153
+ ```ruby
154
+ require 'fact_db'
155
+
156
+ # Configure
157
+ FactDb.configure do |config|
158
+ config.database_url = ENV['DATABASE_URL']
159
+ config.llm_provider = :openai
160
+ config.llm_api_key = ENV['OPENAI_API_KEY']
161
+ end
162
+
163
+ # Create facts instance
164
+ facts = FactDb.new
165
+
166
+ # Ingest content
167
+ content = facts.ingest(
168
+ "Paula Chen joined Microsoft as Principal Engineer on January 10, 2024.",
169
+ type: :announcement,
170
+ captured_at: Time.current
171
+ )
172
+
173
+ # Create entities
174
+ paula = facts.entity_service.create("Paula Chen", type: :person)
175
+ microsoft = facts.entity_service.create("Microsoft", type: :organization)
176
+
177
+ # Extract facts via LLM
178
+ extracted = facts.extract_facts(content.id, extractor: :llm)
179
+
180
+ # Query
181
+ puts "Current facts about Paula:"
182
+ facts.current_facts_for(paula.id).each do |fact|
183
+ puts " - #{fact.fact_text}"
184
+ end
185
+ ```
186
+
187
+ ## Next Steps
188
+
189
+ - [Configuration Guide](../guides/configuration.md) - Detailed configuration options
190
+ - [Ingesting Content](../guides/ingesting-content.md) - Learn about content types
191
+ - [LLM Integration](../guides/llm-integration.md) - Set up LLM providers