fact_db 0.0.2 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. checksums.yaml +4 -4
  2. data/.envrc +2 -0
  3. data/.yardopts +5 -0
  4. data/CHANGELOG.md +103 -0
  5. data/README.md +107 -6
  6. data/Rakefile +253 -10
  7. data/db/migrate/001_enable_extensions.rb +1 -0
  8. data/db/migrate/002_create_sources.rb +49 -0
  9. data/db/migrate/003_create_entities.rb +27 -15
  10. data/db/migrate/004_create_entity_aliases.rb +20 -7
  11. data/db/migrate/005_create_facts.rb +37 -21
  12. data/db/migrate/006_create_entity_mentions.rb +14 -6
  13. data/db/migrate/007_create_fact_sources.rb +16 -8
  14. data/db/migrate/008_add_persistent_tsvectors.rb +97 -0
  15. data/docs/api/extractors/index.md +5 -5
  16. data/docs/api/extractors/llm.md +17 -17
  17. data/docs/api/extractors/rule-based.md +14 -14
  18. data/docs/api/facts.md +20 -20
  19. data/docs/api/index.md +4 -4
  20. data/docs/api/models/entity.md +21 -21
  21. data/docs/api/models/fact.md +15 -15
  22. data/docs/api/models/index.md +7 -7
  23. data/docs/api/models/{content.md → source.md} +29 -29
  24. data/docs/api/pipeline/extraction.md +25 -25
  25. data/docs/api/pipeline/index.md +1 -1
  26. data/docs/api/pipeline/resolution.md +4 -4
  27. data/docs/api/services/entity-service.md +20 -20
  28. data/docs/api/services/fact-service.md +12 -12
  29. data/docs/api/services/index.md +5 -5
  30. data/docs/api/services/{content-service.md → source-service.md} +27 -27
  31. data/docs/architecture/database-schema.md +62 -46
  32. data/docs/architecture/entity-resolution.md +6 -6
  33. data/docs/architecture/index.md +10 -10
  34. data/docs/architecture/temporal-facts.md +5 -5
  35. data/docs/architecture/three-layer-model.md +17 -17
  36. data/docs/concepts.md +6 -6
  37. data/docs/examples/basic-usage.md +20 -20
  38. data/docs/examples/hr-onboarding.md +17 -17
  39. data/docs/examples/index.md +4 -4
  40. data/docs/examples/news-analysis.md +23 -23
  41. data/docs/getting-started/database-setup.md +28 -20
  42. data/docs/getting-started/index.md +3 -3
  43. data/docs/getting-started/quick-start.md +33 -30
  44. data/docs/guides/batch-processing.md +26 -26
  45. data/docs/guides/configuration.md +158 -77
  46. data/docs/guides/entity-management.md +14 -14
  47. data/docs/guides/extracting-facts.md +28 -28
  48. data/docs/guides/ingesting-content.md +14 -14
  49. data/docs/guides/llm-integration.md +40 -32
  50. data/docs/guides/temporal-queries.md +11 -11
  51. data/docs/index.md +6 -2
  52. data/examples/.envrc +4 -0
  53. data/examples/.gitignore +1 -0
  54. data/examples/001_configuration.rb +345 -0
  55. data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
  56. data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
  57. data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
  58. data/examples/040_output_formats.rb +177 -0
  59. data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
  60. data/examples/060_fluent_temporal_api.rb +217 -0
  61. data/examples/070_introspection.rb +252 -0
  62. data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
  63. data/examples/090_ingest_demo.rb +515 -0
  64. data/examples/100_query_context.rb +668 -0
  65. data/examples/110_prove_it.rb +204 -0
  66. data/examples/120_dump_database.rb +358 -0
  67. data/examples/130_rag_feedback_loop.rb +858 -0
  68. data/examples/README.md +229 -15
  69. data/examples/data/lincoln_associates.md +201 -0
  70. data/examples/data/lincoln_biography.md +66 -0
  71. data/examples/data/lincoln_cabinet.md +243 -0
  72. data/examples/data/lincoln_family.md +163 -0
  73. data/examples/data/lincoln_military.md +241 -0
  74. data/examples/data/lincoln_todd_family.md +136 -0
  75. data/examples/ingest_reporter.rb +335 -0
  76. data/examples/utilities.rb +182 -0
  77. data/lib/fact_db/config/defaults.yml +254 -0
  78. data/lib/fact_db/config.rb +94 -35
  79. data/lib/fact_db/database.rb +98 -8
  80. data/lib/fact_db/extractors/base.rb +106 -21
  81. data/lib/fact_db/extractors/llm_extractor.rb +35 -63
  82. data/lib/fact_db/extractors/manual_extractor.rb +46 -6
  83. data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
  84. data/lib/fact_db/llm/adapter.rb +3 -3
  85. data/lib/fact_db/models/entity.rb +94 -22
  86. data/lib/fact_db/models/entity_alias.rb +41 -7
  87. data/lib/fact_db/models/entity_mention.rb +34 -1
  88. data/lib/fact_db/models/fact.rb +259 -28
  89. data/lib/fact_db/models/fact_source.rb +43 -9
  90. data/lib/fact_db/models/source.rb +113 -0
  91. data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
  92. data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
  93. data/lib/fact_db/query_result.rb +202 -0
  94. data/lib/fact_db/resolution/entity_resolver.rb +139 -39
  95. data/lib/fact_db/resolution/fact_resolver.rb +86 -14
  96. data/lib/fact_db/services/entity_service.rb +246 -37
  97. data/lib/fact_db/services/fact_service.rb +254 -17
  98. data/lib/fact_db/services/source_service.rb +164 -0
  99. data/lib/fact_db/temporal/query.rb +71 -7
  100. data/lib/fact_db/temporal/query_builder.rb +69 -0
  101. data/lib/fact_db/temporal/timeline.rb +102 -11
  102. data/lib/fact_db/transformers/base.rb +77 -0
  103. data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
  104. data/lib/fact_db/transformers/json_transformer.rb +17 -0
  105. data/lib/fact_db/transformers/raw_transformer.rb +35 -0
  106. data/lib/fact_db/transformers/text_transformer.rb +114 -0
  107. data/lib/fact_db/transformers/triple_transformer.rb +138 -0
  108. data/lib/fact_db/validation/alias_filter.rb +185 -0
  109. data/lib/fact_db/version.rb +1 -1
  110. data/lib/fact_db.rb +281 -30
  111. data/mkdocs.yml +2 -2
  112. metadata +62 -17
  113. data/db/migrate/002_create_contents.rb +0 -44
  114. data/lib/fact_db/models/content.rb +0 -62
  115. data/lib/fact_db/services/content_service.rb +0 -93
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a87c848e8b274c26d7960e6ac91c5efaf3238b568fb264e2c92e3d37548aa398
4
- data.tar.gz: 72d964331f79436f8efefc0627896709fe2a32487195da21a180504c15c67082
3
+ metadata.gz: ac11ce032c5e56849e6910c26154add65d81d3aaaa349969e584c493633e8db9
4
+ data.tar.gz: c0ace2cd5605d4530fc3f2a16e91a2edb85dd5532ccc80bdf3d0a2248a0e2a27
5
5
  SHA512:
6
- metadata.gz: 74c901b77d7081e53ff87dc81800cc4cb862b83de96ce1e8b15de2f46e0b4d7b8e7b91daaaaedaca9fa723c36da9b0142fa68c2a9583a1f52731b7642ed29245
7
- data.tar.gz: 6f7359db8aaaa3c60c8fba762d4c96157834f2427e4295e760b74671ca30dfa4340c2070078d287b9263d77be7e8f89cb6a6731792a4273cc3233d4bb11aba81
6
+ metadata.gz: e0ecf9c10caaa2e5836282bfac4b8de0b0f287948ca7f9de5413a8a00bb6d188e272b9a0f10d1832ca68806c9743ecdeac3939c304c6636c06d0b96847e33eb9
7
+ data.tar.gz: efd5e98fb240194a467bf6a8ddec659c754635290f54f99dab4c11779f627589fa659b8827feb9b0bd7e1c2d761e592e3c03fc30965d7a54095c32e05c48f310
data/.envrc CHANGED
@@ -1 +1,3 @@
1
1
  export RR=`pwd`
2
+
3
+ export FDB_ENV=development
data/.yardopts ADDED
@@ -0,0 +1,5 @@
1
+ --output-dir doc
2
+ --readme README.md
3
+ --markup markdown
4
+ --asset docs/assets:assets
5
+ lib/**/*.rb
data/CHANGELOG.md CHANGED
@@ -8,6 +8,109 @@ All notable changes to this project will be documented in this file.
8
8
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
9
9
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
10
10
 
11
+ ## [0.1.0] - Unreleased
12
+
13
+ ### Added
14
+
15
+ - **Persistent TSVectors** - Precomputed `tsvector` columns for full-text search
16
+ - `text_vector` column on `fact_db_facts` replaces on-the-fly `to_tsvector()` computation
17
+ - `content_vector` column on `fact_db_sources` replaces on-the-fly `to_tsvector()` computation
18
+ - Database triggers automatically keep vectors in sync on INSERT/UPDATE
19
+ - GIN indexes created `CONCURRENTLY` for non-blocking deployment
20
+ - Migration backfills existing rows and drops redundant expression-based indexes
21
+ - **Configurable LLM Prompts** - Extraction prompts moved to configuration
22
+ - `config.prompts.fact_extraction` - Customizable fact extraction prompt template
23
+ - `config.prompts.entity_extraction` - Customizable entity extraction prompt template
24
+ - `config.prompts.rag_system` - Customizable RAG system prompt
25
+ - Override via config files, environment variables (`FDB_PROMPTS__*`), or programmatic configuration
26
+ - **Configuration Defaults File** - `lib/fact_db/config/defaults.yml` as single source of truth for all config schema and defaults
27
+ - **Configuration Example** - New `examples/001_configuration.rb` demonstrating all configuration methods
28
+ - **Ingest Reporter** - New `examples/ingest_reporter.rb` with structured reporting for markdown ingestion
29
+
30
+ ### Changed
31
+
32
+ - **Full-text search scopes** now query persisted `tsvector` columns instead of computing them at query time
33
+ - `Fact.search_text` queries `text_vector` column directly
34
+ - `Source.search_text` queries `content_vector` column directly
35
+ - `ts_rank_cd()` calls in examples use persisted columns instead of recomputing
36
+ - **LLM Extractor** - Removed hardcoded prompt constants; prompts now loaded from configuration
37
+ - **Markdown ingestion** - Refactored with new reporter and reduced progress verbosity
38
+ - Version bump from 0.0.4 to 0.1.0
39
+
40
+ ## [0.0.4] - 2026-01-12
41
+
42
+ ### Added
43
+
44
+ - **LLM prompt configuration** - Prompts for fact/entity extraction are now configurable
45
+
46
+ ### Changed
47
+
48
+ - Version bump from 0.0.3 to 0.0.4
49
+
50
+ ## [0.0.3] - 2026-01-12
51
+
52
+ ### Added
53
+
54
+ - **Rake Tasks** - New database and documentation tasks
55
+ - `db:dump` - Dump database to file with timestamped naming convention
56
+ - `db:restore` - Restore database from dump file with interactive selection
57
+ - `db:schema:dump` - Dump database schema to `db/schema.sql`
58
+ - `db:schema:load` - Load database schema from `db/schema.sql`
59
+ - `docs:mkdocs` - Build MkDocs documentation site
60
+ - `docs:yard` - Build YARD API documentation
61
+ - `docs:all` - Build all documentation
62
+ - **YARD Documentation** - API documentation with GitHub Pages deployment
63
+ - Added `.yardopts` configuration
64
+ - Added GitHub Actions workflow for YARD deployment
65
+ - YARD docs available at `/yard` subdirectory on GitHub Pages
66
+ - **Trigram Search** - Added pg_trgm extension for fuzzy text matching
67
+ - **RAG Feedback Loop Example** - New example demonstrating retrieval-augmented generation patterns
68
+ - **Output Transformers** - Transform query results into multiple formats optimized for LLM consumption
69
+ - `RawTransformer` - Returns original ActiveRecord objects unchanged for direct database access
70
+ - `JsonTransformer` - JSON-serializable hash format (default)
71
+ - `TripleTransformer` - Subject-Predicate-Object triples for semantic encoding
72
+ - `CypherTransformer` - Cypher-like graph notation with nodes and relationships
73
+ - `TextTransformer` - Human-readable markdown format grouped by fact status
74
+ - **QueryResult** - Unified container for query results that works with all transformers
75
+ - Normalizes facts from ActiveRecord objects or hashes
76
+ - Resolves and caches entities referenced in facts
77
+ - Provides iteration methods (`each_fact`, `each_entity`)
78
+ - **Temporal Query Builder** - Fluent API for point-in-time queries via `facts.at(date)`
79
+ - Chain queries: `facts.at("2024-01-15").query("Paula's role", format: :cypher)`
80
+ - Get facts for entity: `facts.at("2024-01-15").facts_for(entity_id)`
81
+ - Compare dates: `facts.at("2024-01-15").compare_to("2024-06-15")`
82
+ - **Temporal Diff** - Compare what changed between two dates with `facts.diff(topic, from:, to:)`
83
+ - Returns `:added`, `:removed`, and `:unchanged` fact arrays
84
+ - **Introspection API** - Discover what the fact database knows about
85
+ - `facts.introspect` - Get schema, capabilities, entity types, and statistics
86
+ - `facts.introspect("Paula Chen")` - Get coverage and relationships for a topic
87
+ - `facts.suggest_queries(topic)` - Get suggested queries based on stored data
88
+ - `facts.suggest_strategies(query)` - Get recommended retrieval strategies
89
+ - **Format Parameter** - All query methods now accept `format:` parameter
90
+ - Available formats: `:raw`, `:json`, `:triples`, `:cypher`, `:text`
91
+ - Example: `facts.query_facts(topic: "Paula", format: :cypher)`
92
+
93
+ ### Changed
94
+
95
+ - **Configuration** - Replaced `anyway_config` with `myway_config` for configuration management
96
+ - Added environment-specific configuration support
97
+ - `EntityService` now includes `relationship_types_for(entity_id)` and `timespan_for(entity_id)` methods
98
+ - `FactService` now includes `fact_stats(entity_id)` for per-entity statistics
99
+
100
+ ### Breaking Changes
101
+
102
+ - **Database Schema Renames** - Multiple columns and tables renamed for consistency
103
+ - Table: `contents` → `sources`
104
+ - Column: `content_type` → `kind` (in sources)
105
+ - Column: `source_metadata` → `metadata` (in sources)
106
+ - Column: `entity_type` → `type` (in entities)
107
+ - Column: `canonical_name` → `name` (in entities)
108
+ - Column: `merged_into_id` → `canonical_id` (in entities)
109
+ - Column: `alias_text` → `name` (in entity_aliases)
110
+ - Column: `alias_type` → `type` (in entity_aliases)
111
+ - Column: `fact_text` → `text` (in facts)
112
+ - **Terminology** - Replaced `type` with `kind` throughout the codebase for entity and content classification to avoid conflicts with ActiveRecord's reserved `type` column (used for single-table inheritance)
113
+
11
114
  ## [0.0.2] - 2025-01-08
12
115
 
13
116
  ### Fixed
data/README.md CHANGED
@@ -19,7 +19,9 @@ FactDb implements the Event Clock concept - capturing organizational knowledge t
19
19
  - <strong>Audit Trails</strong> - Every fact links back to source content<br>
20
20
  - <strong>Multiple Extractors</strong> - Extract facts manually, via LLM, or rule-based<br>
21
21
  - <strong>Semantic Search</strong> - PostgreSQL with pgvector<br>
22
- - <strong>Concurrent Processing</strong> - Batch process with parallel pipelines
22
+ - <strong>Concurrent Processing</strong> - Batch process with parallel pipelines<br>
23
+ - <strong>Output Formats</strong> - JSON, triples, Cypher, or text for LLM consumption<br>
24
+ - <strong>Temporal Queries</strong> - Fluent API for point-in-time queries and diffs
23
25
  </td>
24
26
  </tr>
25
27
  </table>
@@ -50,8 +52,10 @@ bundle install
50
52
  require 'fact_db'
51
53
 
52
54
  # Configure with a PostgreSQL database URL
55
+ # If you want to use an environment variable name other than the standard
56
+ # FDB_DATABASE__URL, then you must set config.database.url in code ...
53
57
  FactDb.configure do |config|
54
- config.database_url = ENV.fetch("DATABASE_URL", "postgres://#{ENV['USER']}@localhost/fact_db_demo")
58
+ config.database.url = ENV["YOUR_DATABASE_URL_ENVAR_NAME"]
55
59
  end
56
60
 
57
61
  # Run migrations to create the schema (only needed once)
@@ -61,19 +65,27 @@ FactDb::Database.migrate!
61
65
  facts = FactDb.new
62
66
  ```
63
67
 
68
+ Configuration uses nested sections. You can also use environment variables:
69
+
70
+ ```bash
71
+ export FDB_DATABASE__URL="postgresql://localhost/fact_db"
72
+ export FDB_LLM__PROVIDER="openai"
73
+ export FDB_LLM__API_KEY="sk-..."
74
+ ```
75
+
64
76
  Once configured, you can ingest content and create facts:
65
77
 
66
78
  ```ruby
67
79
  # Ingest content
68
80
  content = facts.ingest(
69
81
  "Paula Chen joined Microsoft as Principal Engineer on January 10, 2024.",
70
- type: :email,
82
+ kind: :email,
71
83
  captured_at: Time.now
72
84
  )
73
85
 
74
86
  # Create entities
75
- paula = facts.entity_service.create("Paula Chen", type: :person)
76
- microsoft = facts.entity_service.create("Microsoft", type: :organization)
87
+ paula = facts.entity_service.create("Paula Chen", kind: :person)
88
+ microsoft = facts.entity_service.create("Microsoft", kind: :organization)
77
89
 
78
90
  # Create a fact with entity mentions
79
91
  facts.fact_service.create(
@@ -91,17 +103,106 @@ Query facts temporally:
91
103
  ```ruby
92
104
  # Query current facts about Paula
93
105
  facts.current_facts_for(paula.id).each do |fact|
94
- puts fact.fact_text
106
+ puts fact.text
95
107
  end
96
108
 
97
109
  # Query facts at a point in time (before she joined)
98
110
  facts.facts_at(Date.new(2023, 6, 15), entity: paula.id)
99
111
  ```
100
112
 
113
+ ## Output Formats
114
+
115
+ Query results can be transformed into multiple formats for different use cases:
116
+
117
+ ```ruby
118
+ # Raw - original ActiveRecord objects for direct database access
119
+ results = facts.query_facts(topic: "Paula Chen", format: :raw)
120
+ results.each do |fact|
121
+ puts fact.text
122
+ puts fact.entity_mentions.map(&:entity).map(&:name)
123
+ end
124
+
125
+ # JSON (default) - structured hash
126
+ facts.query_facts(topic: "Paula Chen", format: :json)
127
+
128
+ # Triples - Subject-Predicate-Object for semantic encoding
129
+ facts.query_facts(topic: "Paula Chen", format: :triples)
130
+ # => [["Paula Chen", "kind", "Person"],
131
+ # ["Paula Chen", "works_at", "Microsoft"],
132
+ # ["Paula Chen", "works_at.valid_from", "2024-01-10"]]
133
+
134
+ # Cypher - graph notation with nodes and relationships
135
+ facts.query_facts(topic: "Paula Chen", format: :cypher)
136
+ # => (paula_chen:Person {name: "Paula Chen"})
137
+ # (microsoft:Organization {name: "Microsoft"})
138
+ # (paula_chen)-[:WORKS_AT {since: "2024-01-10"}]->(microsoft)
139
+
140
+ # Text - human-readable markdown
141
+ facts.query_facts(topic: "Paula Chen", format: :text)
142
+ ```
143
+
144
+ ## Temporal Query Builder
145
+
146
+ Use the fluent API for point-in-time queries:
147
+
148
+ ```ruby
149
+ # Query at a specific date
150
+ facts.at("2024-01-15").query("Paula's role", format: :cypher)
151
+
152
+ # Get all facts valid at a date
153
+ facts.at("2024-01-15").facts
154
+
155
+ # Get facts for a specific entity at that date
156
+ facts.at("2024-01-15").facts_for(paula.id)
157
+
158
+ # Compare what changed between two dates
159
+ facts.at("2024-01-15").compare_to("2024-06-15")
160
+ ```
161
+
162
+ ## Comparing Changes Over Time
163
+
164
+ Track what changed between two points in time:
165
+
166
+ ```ruby
167
+ diff = facts.diff("Paula Chen", from: "2024-01-01", to: "2024-06-01")
168
+
169
+ diff[:added] # Facts that became valid
170
+ diff[:removed] # Facts that were superseded
171
+ diff[:unchanged] # Facts that remained valid
172
+ ```
173
+
174
+ ## Introspection
175
+
176
+ Discover what the fact database knows about:
177
+
178
+ ```ruby
179
+ # Get schema and capabilities
180
+ facts.introspect
181
+ # => { capabilities: [:temporal_query, :entity_resolution, ...],
182
+ # entity_kinds: ["person", "organization", ...],
183
+ # output_formats: [:raw, :json, :triples, :cypher, :text],
184
+ # statistics: { facts: {...}, entities: {...} } }
185
+
186
+ # Get coverage for a specific topic
187
+ facts.introspect("Paula Chen")
188
+ # => { entity: {...}, coverage: {...}, relationships: [...],
189
+ # suggested_queries: ["current status", "employment history"] }
190
+
191
+ # Get query suggestions
192
+ facts.suggest_queries("Paula Chen")
193
+ # => ["current status", "employment history", "timeline"]
194
+
195
+ # Get retrieval strategy recommendations
196
+ facts.suggest_strategies("What happened last week?")
197
+ # => [{ strategy: :temporal, description: "Filter by date range" }]
198
+ ```
199
+
101
200
  ## Documentation
102
201
 
103
202
  Full documentation is available at **[https://madbomber.github.io/fact_db](https://madbomber.github.io/fact_db)**
104
203
 
204
+ API documentation (YARD) is available at **[https://madbomber.github.io/fact_db/yard](https://madbomber.github.io/fact_db/yard)**
205
+
105
206
  ## Examples
106
207
 
107
208
  See the [examples directory](examples/README.md) for runnable demo programs covering:
data/Rakefile CHANGED
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "bundler/gem_tasks"
4
+ require "fileutils"
4
5
  require "rake/testtask"
5
6
 
6
7
  Rake::TestTask.new(:test) do |t|
@@ -9,33 +10,275 @@ Rake::TestTask.new(:test) do |t|
9
10
  t.test_files = FileList["test/**/*_test.rb"]
10
11
  end
11
12
 
13
+ # Ensure test environment is set before running tests
14
+ task :set_test_env do
15
+ ENV["FDB_ENV"] = "test"
16
+ end
17
+
18
+ task test: [:set_test_env, "db:reset:test"]
19
+
12
20
  namespace :db do
21
+ desc "Drop the database"
22
+ task :drop do
23
+ require_relative "lib/fact_db"
24
+ puts "Environment: #{FactDb.config.environment}"
25
+ puts "Database: #{FactDb.config.database.name}"
26
+ FactDb::Database.drop!
27
+ end
28
+
29
+ desc "Create the database"
30
+ task :create do
31
+ require_relative "lib/fact_db"
32
+ puts "Environment: #{FactDb.config.environment}"
33
+ puts "Database: #{FactDb.config.database.name}"
34
+ FactDb::Database.create!
35
+ end
36
+
13
37
  desc "Run database migrations"
14
38
  task :migrate do
15
39
  require_relative "lib/fact_db"
16
- FactDb.configure do |config|
17
- config.database_url = ENV.fetch("DATABASE_URL")
18
- end
40
+ puts "Environment: #{FactDb.config.environment}"
41
+ puts "Database: #{FactDb.config.database.name}"
19
42
  FactDb::Database.migrate!
20
43
  end
21
44
 
22
45
  desc "Rollback the last migration"
23
46
  task :rollback do
24
47
  require_relative "lib/fact_db"
25
- FactDb.configure do |config|
26
- config.database_url = ENV.fetch("DATABASE_URL")
27
- end
48
+ puts "Environment: #{FactDb.config.environment}"
49
+ puts "Database: #{FactDb.config.database.name}"
28
50
  FactDb::Database.rollback!
29
51
  end
30
52
 
31
- desc "Reset the database (drop, create, migrate)"
53
+ desc "Reset the database (drop, create, migrate) - honors FDB_ENV"
32
54
  task :reset do
33
55
  require_relative "lib/fact_db"
34
- FactDb.configure do |config|
35
- config.database_url = ENV.fetch("DATABASE_URL")
36
- end
56
+ puts "Environment: #{FactDb.config.environment}"
57
+ puts "Database: #{FactDb.config.database.name}"
37
58
  FactDb::Database.reset!
38
59
  end
60
+
61
+ namespace :reset do
62
+ def reset_for_environment(env_name)
63
+ original_env = ENV["FDB_ENV"]
64
+ ENV["FDB_ENV"] = env_name
65
+
66
+ require_relative "lib/fact_db"
67
+ Anyway::Settings.current_environment = env_name
68
+ FactDb.reset_configuration!
69
+
70
+ puts "Environment: #{FactDb.config.environment}"
71
+ puts "Database: #{FactDb.config.database.name}"
72
+ FactDb::Database.reset!
73
+ ensure
74
+ ENV["FDB_ENV"] = original_env
75
+ Anyway::Settings.current_environment = original_env || "development"
76
+ FactDb.reset_configuration!
77
+ end
78
+
79
+ desc "Reset development database"
80
+ task :development do
81
+ reset_for_environment("development")
82
+ end
83
+
84
+ desc "Reset test database"
85
+ task :test do
86
+ reset_for_environment("test")
87
+ end
88
+
89
+ desc "Reset demo database"
90
+ task :demo do
91
+ reset_for_environment("demo")
92
+ end
93
+
94
+ desc "Reset all databases (development, test, demo)"
95
+ task :all do
96
+ %w[development test demo].each do |env_name|
97
+ puts "\n#{"=" * 50}"
98
+ reset_for_environment(env_name)
99
+ end
100
+ puts "\n#{"=" * 50}"
101
+ puts "All databases reset."
102
+ end
103
+ end
104
+
105
+ namespace :schema do
106
+ desc "Dump database schema to db/schema.sql"
107
+ task :dump do
108
+ require_relative "lib/fact_db"
109
+ db = FactDb.config.database
110
+
111
+ puts "Environment: #{FactDb.config.environment}"
112
+ puts "Database: #{db.name}"
113
+
114
+ schema_file = File.expand_path("db/schema.sql", __dir__)
115
+ host = db.host || "localhost"
116
+ port = db.port || 5432
117
+ user = db.username || ENV["USER"]
118
+
119
+ cmd = ["pg_dump", "--schema-only", "--no-owner", "--no-acl"]
120
+ cmd += ["-h", host, "-p", port.to_s]
121
+ cmd += ["-U", user] if user
122
+ cmd << db.name
123
+
124
+ File.open(schema_file, "w") do |f|
125
+ f.puts "-- Schema dump for #{db.name}"
126
+ f.puts "-- Generated: #{Time.now.utc.iso8601}"
127
+ f.puts "-- Environment: #{FactDb.config.environment}"
128
+ f.puts
129
+ end
130
+
131
+ system(*cmd, out: [schema_file, "a"]) || abort("pg_dump failed")
132
+ puts "Schema dumped to #{schema_file}"
133
+ end
134
+
135
+ desc "Load database schema from db/schema.sql"
136
+ task :load do
137
+ require_relative "lib/fact_db"
138
+ db = FactDb.config.database
139
+
140
+ puts "Environment: #{FactDb.config.environment}"
141
+ puts "Database: #{db.name}"
142
+
143
+ schema_file = File.expand_path("db/schema.sql", __dir__)
144
+ abort("Schema file not found: #{schema_file}") unless File.exist?(schema_file)
145
+
146
+ host = db.host || "localhost"
147
+ port = db.port || 5432
148
+ user = db.username || ENV["USER"]
149
+
150
+ cmd = ["psql", "-q"]
151
+ cmd += ["-h", host, "-p", port.to_s]
152
+ cmd += ["-U", user] if user
153
+ cmd += ["-d", db.name, "-f", schema_file]
154
+
155
+ system(*cmd) || abort("psql failed")
156
+ puts "Schema loaded from #{schema_file}"
157
+ end
158
+ end
159
+
160
+ desc "Dump database to file. DIR=path (default: .)"
161
+ task :dump do
162
+ require_relative "lib/fact_db"
163
+ db = FactDb.config.database
164
+
165
+ puts "Environment: #{FactDb.config.environment}"
166
+ puts "Database: #{db.name}"
167
+
168
+ dumps_dir = ENV["DIR"] || Dir.pwd
169
+ timestamp = Time.now.strftime("%Y%m%d_%H%M%S")
170
+ dump_file = File.join(dumps_dir, "fact_db_#{FactDb.config.environment}_#{timestamp}.dump")
171
+
172
+ host = db.host || "localhost"
173
+ port = db.port || 5432
174
+ user = db.username || ENV["USER"]
175
+
176
+ cmd = ["pg_dump", "-Fc", "--no-owner", "--no-acl"]
177
+ cmd += ["-h", host, "-p", port.to_s]
178
+ cmd += ["-U", user] if user
179
+ cmd += ["-f", dump_file]
180
+ cmd << db.name
181
+
182
+ system(*cmd) || abort("pg_dump failed")
183
+ puts "Database dumped to #{dump_file}"
184
+ end
185
+
186
+ desc "Restore database from dump file. DIR=path (default: .)"
187
+ task :restore do
188
+ require_relative "lib/fact_db"
189
+ db = FactDb.config.database
190
+
191
+ dumps_dir = ENV["DIR"] || Dir.pwd
192
+ pattern = File.join(dumps_dir, "fact_db_#{FactDb.config.environment}_*.dump")
193
+ dump_files = Dir.glob(pattern).sort
194
+ abort("No dump files found matching: #{pattern}") if dump_files.empty?
195
+
196
+ dump_file = if dump_files.size == 1
197
+ dump_files.first
198
+ else
199
+ puts "Available dump files:"
200
+ dump_files.each_with_index do |file, index|
201
+ puts " #{index + 1}. #{File.basename(file)}"
202
+ end
203
+ print "\nSelect file (1-#{dump_files.size}): "
204
+ choice = $stdin.gets.to_i
205
+ abort("Invalid selection") unless choice.between?(1, dump_files.size)
206
+ dump_files[choice - 1]
207
+ end
208
+
209
+ puts "Environment: #{FactDb.config.environment}"
210
+ puts "Database: #{db.name}"
211
+ puts "Restoring from: #{dump_file}"
212
+
213
+ host = db.host || "localhost"
214
+ port = db.port || 5432
215
+ user = db.username || ENV["USER"]
216
+
217
+ cmd = ["pg_restore", "--no-owner", "--no-acl", "-d", db.name]
218
+ cmd += ["-h", host, "-p", port.to_s]
219
+ cmd += ["-U", user] if user
220
+ cmd << dump_file
221
+
222
+ system(*cmd) || abort("pg_restore failed")
223
+ puts "Database restored from #{dump_file}"
224
+ end
225
+
226
+ desc "Clean up invalid aliases (pronouns, generic terms). Use EXECUTE=1 to apply changes."
227
+ task :cleanup_aliases do
228
+ require_relative "lib/fact_db"
229
+ puts "Environment: #{FactDb.config.environment}"
230
+ puts "Database: #{FactDb.config.database.name}"
231
+ FactDb::Database.establish_connection!
232
+
233
+ dry_run = ENV["EXECUTE"] != "1"
234
+ stats = { checked: 0, removed: 0 }
235
+
236
+ puts dry_run ? "\n=== DRY RUN ===" : "\n=== EXECUTING ==="
237
+ puts
238
+
239
+ FactDb::Models::Entity.not_merged.find_each do |entity|
240
+ entity.aliases.each do |alias_record|
241
+ stats[:checked] += 1
242
+ next if FactDb::Validation::AliasFilter.valid?(alias_record.name, name: entity.name)
243
+
244
+ reason = FactDb::Validation::AliasFilter.rejection_reason(alias_record.name, name: entity.name)
245
+ puts "#{entity.name}: removing \"#{alias_record.name}\" (#{reason})"
246
+ alias_record.destroy unless dry_run
247
+ stats[:removed] += 1
248
+ end
249
+ end
250
+
251
+ puts "\nChecked: #{stats[:checked]}, Removed: #{stats[:removed]}"
252
+ puts "\nRun with EXECUTE=1 to apply changes." if dry_run && stats[:removed] > 0
253
+ end
254
+ end
255
+
256
+ namespace :docs do
257
+ desc "Build mkdocs documentation site"
258
+ task :mkdocs do
259
+ output_dir = File.expand_path("site", __dir__)
260
+ system("mkdocs", "build", "--clean") || abort("mkdocs build failed")
261
+ puts "MkDocs site built to #{output_dir}"
262
+ end
263
+
264
+ desc "Build YARD API documentation"
265
+ task :yard do
266
+ output_dir = File.expand_path("doc", __dir__)
267
+ system("yard", "doc") || abort("yard doc failed")
268
+
269
+ # Create symlink for README.md image path (docs/assets -> assets)
270
+ # README.md references docs/assets/fact_db.jpg which needs to resolve in YARD output
271
+ docs_dir = File.join(output_dir, "docs")
272
+ FileUtils.mkdir_p(docs_dir)
273
+ symlink_path = File.join(docs_dir, "assets")
274
+ FileUtils.rm_f(symlink_path)
275
+ FileUtils.ln_sf("../assets", symlink_path)
276
+
277
+ puts "YARD documentation built to #{output_dir}"
278
+ end
279
+
280
+ desc "Build all documentation (mkdocs and YARD)"
281
+ task all: [:mkdocs, :yard]
39
282
  end
40
283
 
41
284
  task default: :test
@@ -3,5 +3,6 @@
3
3
  class EnableExtensions < ActiveRecord::Migration[7.0]
4
4
  def change
5
5
  enable_extension "vector" unless extension_enabled?("vector")
6
+ enable_extension "pg_trgm" unless extension_enabled?("pg_trgm")
6
7
  end
7
8
  end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ class CreateSources < ActiveRecord::Migration[7.0]
4
+ def change
5
+ create_table :fact_db_sources, comment: "Stores immutable source content from which facts are extracted" do |t|
6
+ t.string :content_hash, null: false, limit: 64,
7
+ comment: "SHA-256 hash of content for deduplication and integrity verification"
8
+ t.string :kind, null: false, limit: 50,
9
+ comment: "Classification of content origin (e.g., email, document, webpage, transcript)"
10
+
11
+ t.text :content, null: false,
12
+ comment: "Original unmodified text content, preserved for audit and re-extraction"
13
+ t.string :title, limit: 500,
14
+ comment: "Human-readable title or subject line of the content"
15
+
16
+ t.text :source_uri,
17
+ comment: "URI identifying the original source location (URL, file path, message ID)"
18
+ t.jsonb :metadata, null: false, default: {},
19
+ comment: "Flexible metadata about the source (author, date, headers, etc.)"
20
+
21
+ t.vector :embedding, limit: 1536,
22
+ comment: "Vector embedding for semantic similarity search (OpenAI ada-002 compatible)"
23
+
24
+ t.timestamptz :captured_at, null: false,
25
+ comment: "When the content was originally captured or received"
26
+ t.timestamps
27
+ end
28
+
29
+ add_index :fact_db_sources, :content_hash, unique: true
30
+ add_index :fact_db_sources, :captured_at
31
+ add_index :fact_db_sources, :kind
32
+ add_index :fact_db_sources, :metadata, using: :gin
33
+
34
+ # Full-text search index
35
+ execute <<-SQL
36
+ CREATE INDEX idx_sources_fulltext ON fact_db_sources
37
+ USING gin(to_tsvector('english', content));
38
+ SQL
39
+
40
+ # HNSW index for vector similarity search
41
+ execute <<-SQL
42
+ CREATE INDEX idx_sources_embedding ON fact_db_sources
43
+ USING hnsw (embedding vector_cosine_ops);
44
+ SQL
45
+
46
+ execute "COMMENT ON COLUMN fact_db_sources.created_at IS 'When this record was created in the database';"
47
+ execute "COMMENT ON COLUMN fact_db_sources.updated_at IS 'When this record was last modified';"
48
+ end
49
+ end