fact_db 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. checksums.yaml +4 -4
  2. data/.envrc +2 -0
  3. data/.yardopts +5 -0
  4. data/CHANGELOG.md +64 -0
  5. data/README.md +107 -6
  6. data/Rakefile +243 -10
  7. data/db/migrate/001_enable_extensions.rb +1 -0
  8. data/db/migrate/002_create_sources.rb +49 -0
  9. data/db/migrate/003_create_entities.rb +27 -15
  10. data/db/migrate/004_create_entity_aliases.rb +20 -7
  11. data/db/migrate/005_create_facts.rb +37 -21
  12. data/db/migrate/006_create_entity_mentions.rb +14 -6
  13. data/db/migrate/007_create_fact_sources.rb +16 -8
  14. data/docs/api/extractors/index.md +5 -5
  15. data/docs/api/extractors/llm.md +17 -17
  16. data/docs/api/extractors/rule-based.md +14 -14
  17. data/docs/api/facts.md +20 -20
  18. data/docs/api/index.md +4 -4
  19. data/docs/api/models/entity.md +21 -21
  20. data/docs/api/models/fact.md +15 -15
  21. data/docs/api/models/index.md +7 -7
  22. data/docs/api/models/{content.md → source.md} +29 -29
  23. data/docs/api/pipeline/extraction.md +25 -25
  24. data/docs/api/pipeline/index.md +1 -1
  25. data/docs/api/pipeline/resolution.md +4 -4
  26. data/docs/api/services/entity-service.md +20 -20
  27. data/docs/api/services/fact-service.md +12 -12
  28. data/docs/api/services/index.md +5 -5
  29. data/docs/api/services/{content-service.md → source-service.md} +27 -27
  30. data/docs/architecture/database-schema.md +46 -46
  31. data/docs/architecture/entity-resolution.md +6 -6
  32. data/docs/architecture/index.md +10 -10
  33. data/docs/architecture/temporal-facts.md +5 -5
  34. data/docs/architecture/three-layer-model.md +17 -17
  35. data/docs/concepts.md +6 -6
  36. data/docs/examples/basic-usage.md +20 -20
  37. data/docs/examples/hr-onboarding.md +17 -17
  38. data/docs/examples/index.md +4 -4
  39. data/docs/examples/news-analysis.md +23 -23
  40. data/docs/getting-started/database-setup.md +28 -20
  41. data/docs/getting-started/index.md +3 -3
  42. data/docs/getting-started/quick-start.md +33 -30
  43. data/docs/guides/batch-processing.md +26 -26
  44. data/docs/guides/configuration.md +158 -77
  45. data/docs/guides/entity-management.md +14 -14
  46. data/docs/guides/extracting-facts.md +28 -28
  47. data/docs/guides/ingesting-content.md +14 -14
  48. data/docs/guides/llm-integration.md +40 -32
  49. data/docs/guides/temporal-queries.md +11 -11
  50. data/docs/index.md +6 -2
  51. data/examples/.envrc +4 -0
  52. data/examples/.gitignore +1 -0
  53. data/examples/001_configuration.rb +312 -0
  54. data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
  55. data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
  56. data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
  57. data/examples/040_output_formats.rb +177 -0
  58. data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
  59. data/examples/060_fluent_temporal_api.rb +217 -0
  60. data/examples/070_introspection.rb +252 -0
  61. data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
  62. data/examples/090_ingest_demo.rb +515 -0
  63. data/examples/100_query_context.rb +668 -0
  64. data/examples/110_prove_it.rb +204 -0
  65. data/examples/120_dump_database.rb +358 -0
  66. data/examples/130_rag_feedback_loop.rb +858 -0
  67. data/examples/README.md +229 -15
  68. data/examples/data/lincoln_associates.md +201 -0
  69. data/examples/data/lincoln_biography.md +66 -0
  70. data/examples/data/lincoln_cabinet.md +243 -0
  71. data/examples/data/lincoln_family.md +163 -0
  72. data/examples/data/lincoln_military.md +241 -0
  73. data/examples/data/lincoln_todd_family.md +136 -0
  74. data/examples/ingest_reporter.rb +335 -0
  75. data/examples/utilities.rb +182 -0
  76. data/lib/fact_db/config/defaults.yml +254 -0
  77. data/lib/fact_db/config.rb +94 -35
  78. data/lib/fact_db/database.rb +98 -8
  79. data/lib/fact_db/extractors/base.rb +106 -21
  80. data/lib/fact_db/extractors/llm_extractor.rb +35 -63
  81. data/lib/fact_db/extractors/manual_extractor.rb +46 -6
  82. data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
  83. data/lib/fact_db/llm/adapter.rb +3 -3
  84. data/lib/fact_db/models/entity.rb +94 -22
  85. data/lib/fact_db/models/entity_alias.rb +41 -7
  86. data/lib/fact_db/models/entity_mention.rb +34 -1
  87. data/lib/fact_db/models/fact.rb +259 -28
  88. data/lib/fact_db/models/fact_source.rb +43 -9
  89. data/lib/fact_db/models/source.rb +113 -0
  90. data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
  91. data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
  92. data/lib/fact_db/query_result.rb +202 -0
  93. data/lib/fact_db/resolution/entity_resolver.rb +139 -39
  94. data/lib/fact_db/resolution/fact_resolver.rb +86 -14
  95. data/lib/fact_db/services/entity_service.rb +246 -37
  96. data/lib/fact_db/services/fact_service.rb +254 -17
  97. data/lib/fact_db/services/source_service.rb +164 -0
  98. data/lib/fact_db/temporal/query.rb +71 -7
  99. data/lib/fact_db/temporal/query_builder.rb +69 -0
  100. data/lib/fact_db/temporal/timeline.rb +102 -11
  101. data/lib/fact_db/transformers/base.rb +77 -0
  102. data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
  103. data/lib/fact_db/transformers/json_transformer.rb +17 -0
  104. data/lib/fact_db/transformers/raw_transformer.rb +35 -0
  105. data/lib/fact_db/transformers/text_transformer.rb +114 -0
  106. data/lib/fact_db/transformers/triple_transformer.rb +138 -0
  107. data/lib/fact_db/validation/alias_filter.rb +185 -0
  108. data/lib/fact_db/version.rb +1 -1
  109. data/lib/fact_db.rb +281 -30
  110. data/mkdocs.yml +2 -2
  111. metadata +60 -16
  112. data/db/migrate/002_create_contents.rb +0 -44
  113. data/lib/fact_db/models/content.rb +0 -62
  114. data/lib/fact_db/services/content_service.rb +0 -93
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a87c848e8b274c26d7960e6ac91c5efaf3238b568fb264e2c92e3d37548aa398
4
- data.tar.gz: 72d964331f79436f8efefc0627896709fe2a32487195da21a180504c15c67082
3
+ metadata.gz: c9a22512c569e81df1cd3e7d216dc1356c043c8e454e631df14d3fadffd13b39
4
+ data.tar.gz: 260f0183ffc6a7166d2111a953215c7835ab3fd47e213474d2eda66d2f8ea582
5
5
  SHA512:
6
- metadata.gz: 74c901b77d7081e53ff87dc81800cc4cb862b83de96ce1e8b15de2f46e0b4d7b8e7b91daaaaedaca9fa723c36da9b0142fa68c2a9583a1f52731b7642ed29245
7
- data.tar.gz: 6f7359db8aaaa3c60c8fba762d4c96157834f2427e4295e760b74671ca30dfa4340c2070078d287b9263d77be7e8f89cb6a6731792a4273cc3233d4bb11aba81
6
+ metadata.gz: 539a7bef88cb16f6f590d6227a5b0820347197d679cc187c9e34990381f5b6a550ed60b1733d829da1b5c0e6f67e3b0dc7bc0d9bfda606acad282914c7db9fbe
7
+ data.tar.gz: 4c7e4b859af803c853cd2ce7d1a7aeba7c0d45f34934483d2caacd984eae29f6cf783803de19be071ca51f0b5df41b0e14bdf287a0d4815e42b3e5b2cedb7131
data/.envrc CHANGED
@@ -1 +1,3 @@
1
1
  export RR=`pwd`
2
+
3
+ export FDB_ENV=development
data/.yardopts ADDED
@@ -0,0 +1,5 @@
1
+ --output-dir doc
2
+ --readme README.md
3
+ --markup markdown
4
+ --asset docs/assets:assets
5
+ lib/**/*.rb
data/CHANGELOG.md CHANGED
@@ -8,6 +8,70 @@ All notable changes to this project will be documented in this file.
8
8
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
9
9
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
10
10
 
11
+ ## [0.0.3] - 2026-01-12
12
+
13
+ ### Added
14
+
15
+ - **Rake Tasks** - New database and documentation tasks
16
+ - `db:dump` - Dump database to file with timestamped naming convention
17
+ - `db:restore` - Restore database from dump file with interactive selection
18
+ - `db:schema:dump` - Dump database schema to `db/schema.sql`
19
+ - `db:schema:load` - Load database schema from `db/schema.sql`
20
+ - `docs:mkdocs` - Build MkDocs documentation site
21
+ - `docs:yard` - Build YARD API documentation
22
+ - `docs:all` - Build all documentation
23
+ - **YARD Documentation** - API documentation with GitHub Pages deployment
24
+ - Added `.yardopts` configuration
25
+ - Added GitHub Actions workflow for YARD deployment
26
+ - YARD docs available at `/yard` subdirectory on GitHub Pages
27
+ - **Trigram Search** - Added pg_trgm extension for fuzzy text matching
28
+ - **RAG Feedback Loop Example** - New example demonstrating retrieval-augmented generation patterns
29
+ - **Output Transformers** - Transform query results into multiple formats optimized for LLM consumption
30
+ - `RawTransformer` - Returns original ActiveRecord objects unchanged for direct database access
31
+ - `JsonTransformer` - JSON-serializable hash format (default)
32
+ - `TripleTransformer` - Subject-Predicate-Object triples for semantic encoding
33
+ - `CypherTransformer` - Cypher-like graph notation with nodes and relationships
34
+ - `TextTransformer` - Human-readable markdown format grouped by fact status
35
+ - **QueryResult** - Unified container for query results that works with all transformers
36
+ - Normalizes facts from ActiveRecord objects or hashes
37
+ - Resolves and caches entities referenced in facts
38
+ - Provides iteration methods (`each_fact`, `each_entity`)
39
+ - **Temporal Query Builder** - Fluent API for point-in-time queries via `facts.at(date)`
40
+ - Chain queries: `facts.at("2024-01-15").query("Paula's role", format: :cypher)`
41
+ - Get facts for entity: `facts.at("2024-01-15").facts_for(entity_id)`
42
+ - Compare dates: `facts.at("2024-01-15").compare_to("2024-06-15")`
43
+ - **Temporal Diff** - Compare what changed between two dates with `facts.diff(topic, from:, to:)`
44
+ - Returns `:added`, `:removed`, and `:unchanged` fact arrays
45
+ - **Introspection API** - Discover what the fact database knows about
46
+ - `facts.introspect` - Get schema, capabilities, entity types, and statistics
47
+ - `facts.introspect("Paula Chen")` - Get coverage and relationships for a topic
48
+ - `facts.suggest_queries(topic)` - Get suggested queries based on stored data
49
+ - `facts.suggest_strategies(query)` - Get recommended retrieval strategies
50
+ - **Format Parameter** - All query methods now accept `format:` parameter
51
+ - Available formats: `:raw`, `:json`, `:triples`, `:cypher`, `:text`
52
+ - Example: `facts.query_facts(topic: "Paula", format: :cypher)`
53
+
54
+ ### Changed
55
+
56
+ - **Configuration** - Replaced `anyway_config` with `myway_config` for configuration management
57
+ - Added environment-specific configuration support
58
+ - `EntityService` now includes `relationship_types_for(entity_id)` and `timespan_for(entity_id)` methods
59
+ - `FactService` now includes `fact_stats(entity_id)` for per-entity statistics
60
+
61
+ ### Breaking Changes
62
+
63
+ - **Database Schema Renames** - Multiple columns and tables renamed for consistency
64
+ - Table: `contents` → `sources`
65
+ - Column: `content_type` → `kind` (in sources)
66
+ - Column: `source_metadata` → `metadata` (in sources)
67
+ - Column: `entity_type` → `kind` (in entities)
68
+ - Column: `canonical_name` → `name` (in entities)
69
+ - Column: `merged_into_id` → `canonical_id` (in entities)
70
+ - Column: `alias_text` → `name` (in entity_aliases)
71
+ - Column: `alias_type` → `type` (in entity_aliases)
72
+ - Column: `fact_text` → `text` (in facts)
73
+ - **Terminology** - Replaced `type` with `kind` throughout the codebase for entity and content classification to avoid conflicts with ActiveRecord's reserved `type` column (used for single-table inheritance)
74
+
11
75
  ## [0.0.2] - 2025-01-08
12
76
 
13
77
  ### Fixed
data/README.md CHANGED
@@ -19,7 +19,9 @@ FactDb implements the Event Clock concept - capturing organizational knowledge t
19
19
  - <strong>Audit Trails</strong> - Every fact links back to source content<br>
20
20
  - <strong>Multiple Extractors</strong> - Extract facts manually, via LLM, or rule-based<br>
21
21
  - <strong>Semantic Search</strong> - PostgreSQL with pgvector<br>
22
- - <strong>Concurrent Processing</strong> - Batch process with parallel pipelines
22
+ - <strong>Concurrent Processing</strong> - Batch process with parallel pipelines<br>
23
+ - <strong>Output Formats</strong> - JSON, triples, Cypher, or text for LLM consumption<br>
24
+ - <strong>Temporal Queries</strong> - Fluent API for point-in-time queries and diffs
23
25
  </td>
24
26
  </tr>
25
27
  </table>
@@ -50,8 +52,10 @@ bundle install
50
52
  require 'fact_db'
51
53
 
52
54
  # Configure with a PostgreSQL database URL
55
+ # To read the URL from an environment variable other than the standard
56
+ # FDB_DATABASE__URL, set config.database.url explicitly in code:
53
57
  FactDb.configure do |config|
54
- config.database_url = ENV.fetch("DATABASE_URL", "postgres://#{ENV['USER']}@localhost/fact_db_demo")
58
+ config.database.url = ENV["YOUR_DATABASE_URL_ENVAR_NAME"]
55
59
  end
56
60
 
57
61
  # Run migrations to create the schema (only needed once)
@@ -61,19 +65,27 @@ FactDb::Database.migrate!
61
65
  facts = FactDb.new
62
66
  ```
63
67
 
68
+ Configuration uses nested sections. You can also use environment variables:
69
+
70
+ ```bash
71
+ export FDB_DATABASE__URL="postgresql://localhost/fact_db"
72
+ export FDB_LLM__PROVIDER="openai"
73
+ export FDB_LLM__API_KEY="sk-..."
74
+ ```
75
+
64
76
  Once configured, you can ingest content and create facts:
65
77
 
66
78
  ```ruby
67
79
  # Ingest content
68
80
  content = facts.ingest(
69
81
  "Paula Chen joined Microsoft as Principal Engineer on January 10, 2024.",
70
- type: :email,
82
+ kind: :email,
71
83
  captured_at: Time.now
72
84
  )
73
85
 
74
86
  # Create entities
75
- paula = facts.entity_service.create("Paula Chen", type: :person)
76
- microsoft = facts.entity_service.create("Microsoft", type: :organization)
87
+ paula = facts.entity_service.create("Paula Chen", kind: :person)
88
+ microsoft = facts.entity_service.create("Microsoft", kind: :organization)
77
89
 
78
90
  # Create a fact with entity mentions
79
91
  facts.fact_service.create(
@@ -91,17 +103,106 @@ Query facts temporally:
91
103
  ```ruby
92
104
  # Query current facts about Paula
93
105
  facts.current_facts_for(paula.id).each do |fact|
94
- puts fact.fact_text
106
+ puts fact.text
95
107
  end
96
108
 
97
109
  # Query facts at a point in time (before she joined)
98
110
  facts.facts_at(Date.new(2023, 6, 15), entity: paula.id)
99
111
  ```
100
112
 
113
+ ## Output Formats
114
+
115
+ Query results can be transformed into multiple formats for different use cases:
116
+
117
+ ```ruby
118
+ # Raw - original ActiveRecord objects for direct database access
119
+ results = facts.query_facts(topic: "Paula Chen", format: :raw)
120
+ results.each do |fact|
121
+ puts fact.text
122
+ puts fact.entity_mentions.map(&:entity).map(&:name)
123
+ end
124
+
125
+ # JSON (default) - structured hash
126
+ facts.query_facts(topic: "Paula Chen", format: :json)
127
+
128
+ # Triples - Subject-Predicate-Object for semantic encoding
129
+ facts.query_facts(topic: "Paula Chen", format: :triples)
130
+ # => [["Paula Chen", "kind", "Person"],
131
+ # ["Paula Chen", "works_at", "Microsoft"],
132
+ # ["Paula Chen", "works_at.valid_from", "2024-01-10"]]
133
+
134
+ # Cypher - graph notation with nodes and relationships
135
+ facts.query_facts(topic: "Paula Chen", format: :cypher)
136
+ # => (paula_chen:Person {name: "Paula Chen"})
137
+ # (microsoft:Organization {name: "Microsoft"})
138
+ # (paula_chen)-[:WORKS_AT {since: "2024-01-10"}]->(microsoft)
139
+
140
+ # Text - human-readable markdown
141
+ facts.query_facts(topic: "Paula Chen", format: :text)
142
+ ```
143
+
144
+ ## Temporal Query Builder
145
+
146
+ Use the fluent API for point-in-time queries:
147
+
148
+ ```ruby
149
+ # Query at a specific date
150
+ facts.at("2024-01-15").query("Paula's role", format: :cypher)
151
+
152
+ # Get all facts valid at a date
153
+ facts.at("2024-01-15").facts
154
+
155
+ # Get facts for a specific entity at that date
156
+ facts.at("2024-01-15").facts_for(paula.id)
157
+
158
+ # Compare what changed between two dates
159
+ facts.at("2024-01-15").compare_to("2024-06-15")
160
+ ```
161
+
162
+ ## Comparing Changes Over Time
163
+
164
+ Track what changed between two points in time:
165
+
166
+ ```ruby
167
+ diff = facts.diff("Paula Chen", from: "2024-01-01", to: "2024-06-01")
168
+
169
+ diff[:added] # Facts that became valid
170
+ diff[:removed] # Facts that were superseded
171
+ diff[:unchanged] # Facts that remained valid
172
+ ```
173
+
174
+ ## Introspection
175
+
176
+ Discover what the fact database knows about:
177
+
178
+ ```ruby
179
+ # Get schema and capabilities
180
+ facts.introspect
181
+ # => { capabilities: [:temporal_query, :entity_resolution, ...],
182
+ # entity_kinds: ["person", "organization", ...],
183
+ # output_formats: [:raw, :json, :triples, :cypher, :text],
184
+ # statistics: { facts: {...}, entities: {...} } }
185
+
186
+ # Get coverage for a specific topic
187
+ facts.introspect("Paula Chen")
188
+ # => { entity: {...}, coverage: {...}, relationships: [...],
189
+ # suggested_queries: ["current status", "employment history"] }
190
+
191
+ # Get query suggestions
192
+ facts.suggest_queries("Paula Chen")
193
+ # => ["current status", "employment history", "timeline"]
194
+
195
+ # Get retrieval strategy recommendations
196
+ facts.suggest_strategies("What happened last week?")
197
+ # => [{ strategy: :temporal, description: "Filter by date range" }]
198
+ ```
199
+
101
200
  ## Documentation
102
201
 
103
202
  Full documentation is available at **[https://madbomber.github.io/fact_db](https://madbomber.github.io/fact_db)**
104
203
 
204
+ API documentation (YARD) is available at **[https://madbomber.github.io/fact_db/yard](https://madbomber.github.io/fact_db/yard)**
205
+
105
206
  ## Examples
106
207
 
107
208
  See the [examples directory](examples/README.md) for runnable demo programs covering:
data/Rakefile CHANGED
@@ -9,33 +9,266 @@ Rake::TestTask.new(:test) do |t|
9
9
  t.test_files = FileList["test/**/*_test.rb"]
10
10
  end
11
11
 
12
+ # Ensure test environment is set before running tests
13
+ task :set_test_env do
14
+ ENV["FDB_ENV"] = "test"
15
+ end
16
+
17
+ task test: [:set_test_env, "db:reset:test"]
18
+
12
19
  namespace :db do
20
+ desc "Drop the database"
21
+ task :drop do
22
+ require_relative "lib/fact_db"
23
+ puts "Environment: #{FactDb.config.environment}"
24
+ puts "Database: #{FactDb.config.database.name}"
25
+ FactDb::Database.drop!
26
+ end
27
+
28
+ desc "Create the database"
29
+ task :create do
30
+ require_relative "lib/fact_db"
31
+ puts "Environment: #{FactDb.config.environment}"
32
+ puts "Database: #{FactDb.config.database.name}"
33
+ FactDb::Database.create!
34
+ end
35
+
13
36
  desc "Run database migrations"
14
37
  task :migrate do
15
38
  require_relative "lib/fact_db"
16
- FactDb.configure do |config|
17
- config.database_url = ENV.fetch("DATABASE_URL")
18
- end
39
+ puts "Environment: #{FactDb.config.environment}"
40
+ puts "Database: #{FactDb.config.database.name}"
19
41
  FactDb::Database.migrate!
20
42
  end
21
43
 
22
44
  desc "Rollback the last migration"
23
45
  task :rollback do
24
46
  require_relative "lib/fact_db"
25
- FactDb.configure do |config|
26
- config.database_url = ENV.fetch("DATABASE_URL")
27
- end
47
+ puts "Environment: #{FactDb.config.environment}"
48
+ puts "Database: #{FactDb.config.database.name}"
28
49
  FactDb::Database.rollback!
29
50
  end
30
51
 
31
- desc "Reset the database (drop, create, migrate)"
52
+ desc "Reset the database (drop, create, migrate) - honors FDB_ENV"
32
53
  task :reset do
33
54
  require_relative "lib/fact_db"
34
- FactDb.configure do |config|
35
- config.database_url = ENV.fetch("DATABASE_URL")
36
- end
55
+ puts "Environment: #{FactDb.config.environment}"
56
+ puts "Database: #{FactDb.config.database.name}"
37
57
  FactDb::Database.reset!
38
58
  end
59
+
60
+ namespace :reset do
61
+ def reset_for_environment(env_name)
62
+ original_env = ENV["FDB_ENV"]
63
+ ENV["FDB_ENV"] = env_name
64
+
65
+ require_relative "lib/fact_db"
66
+ Anyway::Settings.current_environment = env_name
67
+ FactDb.reset_configuration!
68
+
69
+ puts "Environment: #{FactDb.config.environment}"
70
+ puts "Database: #{FactDb.config.database.name}"
71
+ FactDb::Database.reset!
72
+ ensure
73
+ ENV["FDB_ENV"] = original_env
74
+ Anyway::Settings.current_environment = original_env || "development"
75
+ FactDb.reset_configuration!
76
+ end
77
+
78
+ desc "Reset development database"
79
+ task :development do
80
+ reset_for_environment("development")
81
+ end
82
+
83
+ desc "Reset test database"
84
+ task :test do
85
+ reset_for_environment("test")
86
+ end
87
+
88
+ desc "Reset demo database"
89
+ task :demo do
90
+ reset_for_environment("demo")
91
+ end
92
+
93
+ desc "Reset all databases (development, test, demo)"
94
+ task :all do
95
+ %w[development test demo].each do |env_name|
96
+ puts "\n#{"=" * 50}"
97
+ reset_for_environment(env_name)
98
+ end
99
+ puts "\n#{"=" * 50}"
100
+ puts "All databases reset."
101
+ end
102
+ end
103
+
104
+ namespace :schema do
105
+ desc "Dump database schema to db/schema.sql"
106
+ task :dump do
107
+ require_relative "lib/fact_db"
108
+ db = FactDb.config.database
109
+
110
+ puts "Environment: #{FactDb.config.environment}"
111
+ puts "Database: #{db.name}"
112
+
113
+ schema_file = File.expand_path("db/schema.sql", __dir__)
114
+ host = db.host || "localhost"
115
+ port = db.port || 5432
116
+ user = db.username || ENV["USER"]
117
+
118
+ cmd = ["pg_dump", "--schema-only", "--no-owner", "--no-acl"]
119
+ cmd += ["-h", host, "-p", port.to_s]
120
+ cmd += ["-U", user] if user
121
+ cmd << db.name
122
+
123
+ File.open(schema_file, "w") do |f|
124
+ f.puts "-- Schema dump for #{db.name}"
125
+ f.puts "-- Generated: #{Time.now.utc.iso8601}"
126
+ f.puts "-- Environment: #{FactDb.config.environment}"
127
+ f.puts
128
+ end
129
+
130
+ system(*cmd, out: [schema_file, "a"]) || abort("pg_dump failed")
131
+ puts "Schema dumped to #{schema_file}"
132
+ end
133
+
134
+ desc "Load database schema from db/schema.sql"
135
+ task :load do
136
+ require_relative "lib/fact_db"
137
+ db = FactDb.config.database
138
+
139
+ puts "Environment: #{FactDb.config.environment}"
140
+ puts "Database: #{db.name}"
141
+
142
+ schema_file = File.expand_path("db/schema.sql", __dir__)
143
+ abort("Schema file not found: #{schema_file}") unless File.exist?(schema_file)
144
+
145
+ host = db.host || "localhost"
146
+ port = db.port || 5432
147
+ user = db.username || ENV["USER"]
148
+
149
+ cmd = ["psql", "-q"]
150
+ cmd += ["-h", host, "-p", port.to_s]
151
+ cmd += ["-U", user] if user
152
+ cmd += ["-d", db.name, "-f", schema_file]
153
+
154
+ system(*cmd) || abort("psql failed")
155
+ puts "Schema loaded from #{schema_file}"
156
+ end
157
+ end
158
+
159
+ desc "Dump database to file. DIR=path (default: .)"
160
+ task :dump do
161
+ require_relative "lib/fact_db"
162
+ db = FactDb.config.database
163
+
164
+ puts "Environment: #{FactDb.config.environment}"
165
+ puts "Database: #{db.name}"
166
+
167
+ dumps_dir = ENV["DIR"] || Dir.pwd
168
+ timestamp = Time.now.strftime("%Y%m%d_%H%M%S")
169
+ dump_file = File.join(dumps_dir, "fact_db_#{FactDb.config.environment}_#{timestamp}.dump")
170
+
171
+ host = db.host || "localhost"
172
+ port = db.port || 5432
173
+ user = db.username || ENV["USER"]
174
+
175
+ cmd = ["pg_dump", "-Fc", "--no-owner", "--no-acl"]
176
+ cmd += ["-h", host, "-p", port.to_s]
177
+ cmd += ["-U", user] if user
178
+ cmd += ["-f", dump_file]
179
+ cmd << db.name
180
+
181
+ system(*cmd) || abort("pg_dump failed")
182
+ puts "Database dumped to #{dump_file}"
183
+ end
184
+
185
+ desc "Restore database from dump file. DIR=path (default: .)"
186
+ task :restore do
187
+ require_relative "lib/fact_db"
188
+ db = FactDb.config.database
189
+
190
+ dumps_dir = ENV["DIR"] || Dir.pwd
191
+ pattern = File.join(dumps_dir, "fact_db_#{FactDb.config.environment}_*.dump")
192
+ dump_files = Dir.glob(pattern).sort
193
+ abort("No dump files found matching: #{pattern}") if dump_files.empty?
194
+
195
+ dump_file = if dump_files.size == 1
196
+ dump_files.first
197
+ else
198
+ puts "Available dump files:"
199
+ dump_files.each_with_index do |file, index|
200
+ puts " #{index + 1}. #{File.basename(file)}"
201
+ end
202
+ print "\nSelect file (1-#{dump_files.size}): "
203
+ choice = $stdin.gets.to_i
204
+ abort("Invalid selection") unless choice.between?(1, dump_files.size)
205
+ dump_files[choice - 1]
206
+ end
207
+
208
+ puts "Environment: #{FactDb.config.environment}"
209
+ puts "Database: #{db.name}"
210
+ puts "Restoring from: #{dump_file}"
211
+
212
+ host = db.host || "localhost"
213
+ port = db.port || 5432
214
+ user = db.username || ENV["USER"]
215
+
216
+ cmd = ["pg_restore", "--no-owner", "--no-acl", "-d", db.name]
217
+ cmd += ["-h", host, "-p", port.to_s]
218
+ cmd += ["-U", user] if user
219
+ cmd << dump_file
220
+
221
+ system(*cmd) || abort("pg_restore failed")
222
+ puts "Database restored from #{dump_file}"
223
+ end
224
+
225
+ desc "Clean up invalid aliases (pronouns, generic terms). Use EXECUTE=1 to apply changes."
226
+ task :cleanup_aliases do
227
+ require_relative "lib/fact_db"
228
+ puts "Environment: #{FactDb.config.environment}"
229
+ puts "Database: #{FactDb.config.database.name}"
230
+ FactDb::Database.establish_connection!
231
+
232
+ dry_run = ENV["EXECUTE"] != "1"
233
+ stats = { checked: 0, removed: 0 }
234
+
235
+ puts dry_run ? "\n=== DRY RUN ===" : "\n=== EXECUTING ==="
236
+ puts
237
+
238
+ FactDb::Models::Entity.not_merged.find_each do |entity|
239
+ entity.aliases.each do |alias_record|
240
+ stats[:checked] += 1
241
+ next if FactDb::Validation::AliasFilter.valid?(alias_record.name, name: entity.name)
242
+
243
+ reason = FactDb::Validation::AliasFilter.rejection_reason(alias_record.name, name: entity.name)
244
+ puts "#{entity.name}: removing \"#{alias_record.name}\" (#{reason})"
245
+ alias_record.destroy unless dry_run
246
+ stats[:removed] += 1
247
+ end
248
+ end
249
+
250
+ puts "\nChecked: #{stats[:checked]}, Removed: #{stats[:removed]}"
251
+ puts "\nRun with EXECUTE=1 to apply changes." if dry_run && stats[:removed] > 0
252
+ end
253
+ end
254
+
255
+ namespace :docs do
256
+ desc "Build mkdocs documentation site"
257
+ task :mkdocs do
258
+ output_dir = File.expand_path("site", __dir__)
259
+ system("mkdocs", "build", "--clean") || abort("mkdocs build failed")
260
+ puts "MkDocs site built to #{output_dir}"
261
+ end
262
+
263
+ desc "Build YARD API documentation"
264
+ task :yard do
265
+ output_dir = File.expand_path("doc", __dir__)
266
+ system("yard", "doc") || abort("yard doc failed")
267
+ puts "YARD documentation built to #{output_dir}"
268
+ end
269
+
270
+ desc "Build all documentation (mkdocs and YARD)"
271
+ task all: [:mkdocs, :yard]
39
272
  end
40
273
 
41
274
  task default: :test
@@ -3,5 +3,6 @@
3
3
  class EnableExtensions < ActiveRecord::Migration[7.0]
4
4
  def change
5
5
  enable_extension "vector" unless extension_enabled?("vector")
6
+ enable_extension "pg_trgm" unless extension_enabled?("pg_trgm")
6
7
  end
7
8
  end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ class CreateSources < ActiveRecord::Migration[7.0]
4
+ def change
5
+ create_table :fact_db_sources, comment: "Stores immutable source content from which facts are extracted" do |t|
6
+ t.string :content_hash, null: false, limit: 64,
7
+ comment: "SHA-256 hash of content for deduplication and integrity verification"
8
+ t.string :kind, null: false, limit: 50,
9
+ comment: "Classification of content origin (e.g., email, document, webpage, transcript)"
10
+
11
+ t.text :content, null: false,
12
+ comment: "Original unmodified text content, preserved for audit and re-extraction"
13
+ t.string :title, limit: 500,
14
+ comment: "Human-readable title or subject line of the content"
15
+
16
+ t.text :source_uri,
17
+ comment: "URI identifying the original source location (URL, file path, message ID)"
18
+ t.jsonb :metadata, null: false, default: {},
19
+ comment: "Flexible metadata about the source (author, date, headers, etc.)"
20
+
21
+ t.vector :embedding, limit: 1536,
22
+ comment: "Vector embedding for semantic similarity search (OpenAI ada-002 compatible)"
23
+
24
+ t.timestamptz :captured_at, null: false,
25
+ comment: "When the content was originally captured or received"
26
+ t.timestamps
27
+ end
28
+
29
+ add_index :fact_db_sources, :content_hash, unique: true
30
+ add_index :fact_db_sources, :captured_at
31
+ add_index :fact_db_sources, :kind
32
+ add_index :fact_db_sources, :metadata, using: :gin
33
+
34
+ # Full-text search index
35
+ execute <<-SQL
36
+ CREATE INDEX idx_sources_fulltext ON fact_db_sources
37
+ USING gin(to_tsvector('english', content));
38
+ SQL
39
+
40
+ # HNSW index for vector similarity search
41
+ execute <<-SQL
42
+ CREATE INDEX idx_sources_embedding ON fact_db_sources
43
+ USING hnsw (embedding vector_cosine_ops);
44
+ SQL
45
+
46
+ execute "COMMENT ON COLUMN fact_db_sources.created_at IS 'When this record was created in the database';"
47
+ execute "COMMENT ON COLUMN fact_db_sources.updated_at IS 'When this record was last modified';"
48
+ end
49
+ end
@@ -2,35 +2,47 @@
2
2
 
3
3
  class CreateEntities < ActiveRecord::Migration[7.0]
4
4
  def change
5
- create_table :fact_db_entities do |t|
6
- # Identity
7
- t.string :canonical_name, null: false, limit: 500
8
- t.string :entity_type, null: false, limit: 50
5
+ create_table :fact_db_entities, comment: "Canonical representations of people, organizations, places, and other named entities" do |t|
6
+ t.string :name, null: false, limit: 500,
7
+ comment: "Authoritative name for this entity after resolution and normalization"
8
+ t.string :kind, null: false, limit: 50,
9
+ comment: "Classification of entity (person, organization, location, product, event, etc.)"
9
10
 
10
- # Resolution metadata
11
- t.string :resolution_status, null: false, default: "unresolved", limit: 20
12
- t.bigint :merged_into_id
11
+ t.string :resolution_status, null: false, default: "unresolved", limit: 20,
12
+ comment: "Entity resolution state: unresolved, resolved, merged, or ambiguous"
13
+ t.bigint :canonical_id,
14
+ comment: "Reference to canonical entity if this entity was merged as a duplicate"
13
15
 
14
- # Descriptive metadata
15
- t.text :description
16
- t.jsonb :metadata, null: false, default: {}
16
+ t.text :description,
17
+ comment: "Human-readable description providing context about this entity"
18
+ t.jsonb :metadata, null: false, default: {},
19
+ comment: "Flexible attributes specific to entity type (titles, roles, identifiers, etc.)"
17
20
 
18
- # Vector embedding for semantic matching
19
- t.vector :embedding, limit: 1536
21
+ t.vector :embedding, limit: 1536,
22
+ comment: "Vector embedding for semantic entity matching and similarity search"
20
23
 
21
24
  t.timestamps
22
25
  end
23
26
 
24
- add_index :fact_db_entities, :canonical_name
25
- add_index :fact_db_entities, :entity_type
27
+ add_index :fact_db_entities, :name
28
+ add_index :fact_db_entities, :kind
26
29
  add_index :fact_db_entities, :resolution_status
27
30
  add_foreign_key :fact_db_entities, :fact_db_entities,
28
- column: :merged_into_id, on_delete: :nullify
31
+ column: :canonical_id, on_delete: :nullify
29
32
 
30
33
  # HNSW index for vector similarity search
31
34
  execute <<-SQL
32
35
  CREATE INDEX idx_entities_embedding ON fact_db_entities
33
36
  USING hnsw (embedding vector_cosine_ops);
34
37
  SQL
38
+
39
+ # GIN trigram index on name for fast fuzzy matching
40
+ execute <<-SQL
41
+ CREATE INDEX idx_entities_name_trgm ON fact_db_entities
42
+ USING gin (name gin_trgm_ops);
43
+ SQL
44
+
45
+ execute "COMMENT ON COLUMN fact_db_entities.created_at IS 'When this entity was first identified';"
46
+ execute "COMMENT ON COLUMN fact_db_entities.updated_at IS 'When this entity record was last modified';"
35
47
  end
36
48
  end