fact_db 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.envrc +2 -0
- data/.yardopts +5 -0
- data/CHANGELOG.md +64 -0
- data/README.md +107 -6
- data/Rakefile +243 -10
- data/db/migrate/001_enable_extensions.rb +1 -0
- data/db/migrate/002_create_sources.rb +49 -0
- data/db/migrate/003_create_entities.rb +27 -15
- data/db/migrate/004_create_entity_aliases.rb +20 -7
- data/db/migrate/005_create_facts.rb +37 -21
- data/db/migrate/006_create_entity_mentions.rb +14 -6
- data/db/migrate/007_create_fact_sources.rb +16 -8
- data/docs/api/extractors/index.md +5 -5
- data/docs/api/extractors/llm.md +17 -17
- data/docs/api/extractors/rule-based.md +14 -14
- data/docs/api/facts.md +20 -20
- data/docs/api/index.md +4 -4
- data/docs/api/models/entity.md +21 -21
- data/docs/api/models/fact.md +15 -15
- data/docs/api/models/index.md +7 -7
- data/docs/api/models/{content.md → source.md} +29 -29
- data/docs/api/pipeline/extraction.md +25 -25
- data/docs/api/pipeline/index.md +1 -1
- data/docs/api/pipeline/resolution.md +4 -4
- data/docs/api/services/entity-service.md +20 -20
- data/docs/api/services/fact-service.md +12 -12
- data/docs/api/services/index.md +5 -5
- data/docs/api/services/{content-service.md → source-service.md} +27 -27
- data/docs/architecture/database-schema.md +46 -46
- data/docs/architecture/entity-resolution.md +6 -6
- data/docs/architecture/index.md +10 -10
- data/docs/architecture/temporal-facts.md +5 -5
- data/docs/architecture/three-layer-model.md +17 -17
- data/docs/concepts.md +6 -6
- data/docs/examples/basic-usage.md +20 -20
- data/docs/examples/hr-onboarding.md +17 -17
- data/docs/examples/index.md +4 -4
- data/docs/examples/news-analysis.md +23 -23
- data/docs/getting-started/database-setup.md +28 -20
- data/docs/getting-started/index.md +3 -3
- data/docs/getting-started/quick-start.md +33 -30
- data/docs/guides/batch-processing.md +26 -26
- data/docs/guides/configuration.md +158 -77
- data/docs/guides/entity-management.md +14 -14
- data/docs/guides/extracting-facts.md +28 -28
- data/docs/guides/ingesting-content.md +14 -14
- data/docs/guides/llm-integration.md +40 -32
- data/docs/guides/temporal-queries.md +11 -11
- data/docs/index.md +6 -2
- data/examples/.envrc +4 -0
- data/examples/.gitignore +1 -0
- data/examples/001_configuration.rb +312 -0
- data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
- data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
- data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
- data/examples/040_output_formats.rb +177 -0
- data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
- data/examples/060_fluent_temporal_api.rb +217 -0
- data/examples/070_introspection.rb +252 -0
- data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
- data/examples/090_ingest_demo.rb +515 -0
- data/examples/100_query_context.rb +668 -0
- data/examples/110_prove_it.rb +204 -0
- data/examples/120_dump_database.rb +358 -0
- data/examples/130_rag_feedback_loop.rb +858 -0
- data/examples/README.md +229 -15
- data/examples/data/lincoln_associates.md +201 -0
- data/examples/data/lincoln_biography.md +66 -0
- data/examples/data/lincoln_cabinet.md +243 -0
- data/examples/data/lincoln_family.md +163 -0
- data/examples/data/lincoln_military.md +241 -0
- data/examples/data/lincoln_todd_family.md +136 -0
- data/examples/ingest_reporter.rb +335 -0
- data/examples/utilities.rb +182 -0
- data/lib/fact_db/config/defaults.yml +254 -0
- data/lib/fact_db/config.rb +94 -35
- data/lib/fact_db/database.rb +98 -8
- data/lib/fact_db/extractors/base.rb +106 -21
- data/lib/fact_db/extractors/llm_extractor.rb +35 -63
- data/lib/fact_db/extractors/manual_extractor.rb +46 -6
- data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
- data/lib/fact_db/llm/adapter.rb +3 -3
- data/lib/fact_db/models/entity.rb +94 -22
- data/lib/fact_db/models/entity_alias.rb +41 -7
- data/lib/fact_db/models/entity_mention.rb +34 -1
- data/lib/fact_db/models/fact.rb +259 -28
- data/lib/fact_db/models/fact_source.rb +43 -9
- data/lib/fact_db/models/source.rb +113 -0
- data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
- data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
- data/lib/fact_db/query_result.rb +202 -0
- data/lib/fact_db/resolution/entity_resolver.rb +139 -39
- data/lib/fact_db/resolution/fact_resolver.rb +86 -14
- data/lib/fact_db/services/entity_service.rb +246 -37
- data/lib/fact_db/services/fact_service.rb +254 -17
- data/lib/fact_db/services/source_service.rb +164 -0
- data/lib/fact_db/temporal/query.rb +71 -7
- data/lib/fact_db/temporal/query_builder.rb +69 -0
- data/lib/fact_db/temporal/timeline.rb +102 -11
- data/lib/fact_db/transformers/base.rb +77 -0
- data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
- data/lib/fact_db/transformers/json_transformer.rb +17 -0
- data/lib/fact_db/transformers/raw_transformer.rb +35 -0
- data/lib/fact_db/transformers/text_transformer.rb +114 -0
- data/lib/fact_db/transformers/triple_transformer.rb +138 -0
- data/lib/fact_db/validation/alias_filter.rb +185 -0
- data/lib/fact_db/version.rb +1 -1
- data/lib/fact_db.rb +281 -30
- data/mkdocs.yml +2 -2
- metadata +60 -16
- data/db/migrate/002_create_contents.rb +0 -44
- data/lib/fact_db/models/content.rb +0 -62
- data/lib/fact_db/services/content_service.rb +0 -93
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c9a22512c569e81df1cd3e7d216dc1356c043c8e454e631df14d3fadffd13b39
|
|
4
|
+
data.tar.gz: 260f0183ffc6a7166d2111a953215c7835ab3fd47e213474d2eda66d2f8ea582
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 539a7bef88cb16f6f590d6227a5b0820347197d679cc187c9e34990381f5b6a550ed60b1733d829da1b5c0e6f67e3b0dc7bc0d9bfda606acad282914c7db9fbe
|
|
7
|
+
data.tar.gz: 4c7e4b859af803c853cd2ce7d1a7aeba7c0d45f34934483d2caacd984eae29f6cf783803de19be071ca51f0b5df41b0e14bdf287a0d4815e42b3e5b2cedb7131
|
data/.envrc
CHANGED
data/.yardopts
ADDED
data/CHANGELOG.md
CHANGED
|
@@ -8,6 +8,70 @@ All notable changes to this project will be documented in this file.
|
|
|
8
8
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
9
9
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
10
10
|
|
|
11
|
+
## [0.0.3] - 2026-01-12
|
|
12
|
+
|
|
13
|
+
### Added
|
|
14
|
+
|
|
15
|
+
- **Rake Tasks** - New database and documentation tasks
|
|
16
|
+
- `db:dump` - Dump database to file with timestamped naming convention
|
|
17
|
+
- `db:restore` - Restore database from dump file with interactive selection
|
|
18
|
+
- `db:schema:dump` - Dump database schema to `db/schema.sql`
|
|
19
|
+
- `db:schema:load` - Load database schema from `db/schema.sql`
|
|
20
|
+
- `docs:mkdocs` - Build MkDocs documentation site
|
|
21
|
+
- `docs:yard` - Build YARD API documentation
|
|
22
|
+
- `docs:all` - Build all documentation
|
|
23
|
+
- **YARD Documentation** - API documentation with GitHub Pages deployment
|
|
24
|
+
- Added `.yardopts` configuration
|
|
25
|
+
- Added GitHub Actions workflow for YARD deployment
|
|
26
|
+
- YARD docs available at `/yard` subdirectory on GitHub Pages
|
|
27
|
+
- **Trigram Search** - Added pg_trgm extension for fuzzy text matching
|
|
28
|
+
- **RAG Feedback Loop Example** - New example demonstrating retrieval-augmented generation patterns
|
|
29
|
+
- **Output Transformers** - Transform query results into multiple formats optimized for LLM consumption
|
|
30
|
+
- `RawTransformer` - Returns original ActiveRecord objects unchanged for direct database access
|
|
31
|
+
- `JsonTransformer` - JSON-serializable hash format (default)
|
|
32
|
+
- `TripleTransformer` - Subject-Predicate-Object triples for semantic encoding
|
|
33
|
+
- `CypherTransformer` - Cypher-like graph notation with nodes and relationships
|
|
34
|
+
- `TextTransformer` - Human-readable markdown format grouped by fact status
|
|
35
|
+
- **QueryResult** - Unified container for query results that works with all transformers
|
|
36
|
+
- Normalizes facts from ActiveRecord objects or hashes
|
|
37
|
+
- Resolves and caches entities referenced in facts
|
|
38
|
+
- Provides iteration methods (`each_fact`, `each_entity`)
|
|
39
|
+
- **Temporal Query Builder** - Fluent API for point-in-time queries via `facts.at(date)`
|
|
40
|
+
- Chain queries: `facts.at("2024-01-15").query("Paula's role", format: :cypher)`
|
|
41
|
+
- Get facts for entity: `facts.at("2024-01-15").facts_for(entity_id)`
|
|
42
|
+
- Compare dates: `facts.at("2024-01-15").compare_to("2024-06-15")`
|
|
43
|
+
- **Temporal Diff** - Compare what changed between two dates with `facts.diff(topic, from:, to:)`
|
|
44
|
+
- Returns `:added`, `:removed`, and `:unchanged` fact arrays
|
|
45
|
+
- **Introspection API** - Discover what the fact database knows about
|
|
46
|
+
- `facts.introspect` - Get schema, capabilities, entity types, and statistics
|
|
47
|
+
- `facts.introspect("Paula Chen")` - Get coverage and relationships for a topic
|
|
48
|
+
- `facts.suggest_queries(topic)` - Get suggested queries based on stored data
|
|
49
|
+
- `facts.suggest_strategies(query)` - Get recommended retrieval strategies
|
|
50
|
+
- **Format Parameter** - All query methods now accept `format:` parameter
|
|
51
|
+
- Available formats: `:raw`, `:json`, `:triples`, `:cypher`, `:text`
|
|
52
|
+
- Example: `facts.query_facts(topic: "Paula", format: :cypher)`
|
|
53
|
+
|
|
54
|
+
### Changed
|
|
55
|
+
|
|
56
|
+
- **Configuration** - Replaced `anyway_config` with `myway_config` for configuration management
|
|
57
|
+
- Added environment-specific configuration support
|
|
58
|
+
- `EntityService` now includes `relationship_types_for(entity_id)` and `timespan_for(entity_id)` methods
|
|
59
|
+
- `FactService` now includes `fact_stats(entity_id)` for per-entity statistics
|
|
60
|
+
|
|
61
|
+
### Breaking Changes
|
|
62
|
+
|
|
63
|
+
- **Database Schema Renames** - Multiple columns and tables renamed for consistency
|
|
64
|
+
- Table: `contents` → `sources`
|
|
65
|
+
- Column: `content_type` → `kind` (in sources)
|
|
66
|
+
- Column: `source_metadata` → `metadata` (in sources)
|
|
67
|
+
- Column: `entity_type` → `kind` (in entities)
|
|
68
|
+
- Column: `canonical_name` → `name` (in entities)
|
|
69
|
+
- Column: `merged_into_id` → `canonical_id` (in entities)
|
|
70
|
+
- Column: `alias_text` → `name` (in entity_aliases)
|
|
71
|
+
- Column: `alias_type` → `type` (in entity_aliases)
|
|
72
|
+
- Column: `fact_text` → `text` (in facts)
|
|
73
|
+
- **Terminology** - Replaced `type` with `kind` throughout the codebase for entity and content classification to avoid conflicts with ActiveRecord's reserved `type` column (used for single-table inheritance)
|
|
74
|
+
|
|
11
75
|
## [0.0.2] - 2025-01-08
|
|
12
76
|
|
|
13
77
|
### Fixed
|
data/README.md
CHANGED
|
@@ -19,7 +19,9 @@ FactDb implements the Event Clock concept - capturing organizational knowledge t
|
|
|
19
19
|
- <strong>Audit Trails</strong> - Every fact links back to source content<br>
|
|
20
20
|
- <strong>Multiple Extractors</strong> - Extract facts manually, via LLM, or rule-based<br>
|
|
21
21
|
- <strong>Semantic Search</strong> - PostgreSQL with pgvector<br>
|
|
22
|
-
- <strong>Concurrent Processing</strong> - Batch process with parallel pipelines
|
|
22
|
+
- <strong>Concurrent Processing</strong> - Batch process with parallel pipelines<br>
|
|
23
|
+
- <strong>Output Formats</strong> - JSON, triples, Cypher, or text for LLM consumption<br>
|
|
24
|
+
- <strong>Temporal Queries</strong> - Fluent API for point-in-time queries and diffs
|
|
23
25
|
</td>
|
|
24
26
|
</tr>
|
|
25
27
|
</table>
|
|
@@ -50,8 +52,10 @@ bundle install
|
|
|
50
52
|
require 'fact_db'
|
|
51
53
|
|
|
52
54
|
# Configure with a PostgreSQL database URL
|
|
55
|
+
# If you want to use an environment variable name other than the standard
|
|
56
|
+
# FDB_DATABASE__URL, then you must set config.database.url in code:
|
|
53
57
|
FactDb.configure do |config|
|
|
54
|
-
config.
|
|
58
|
+
config.database.url = ENV["YOUR_DATABASE_URL_ENVAR_NAME"]
|
|
55
59
|
end
|
|
56
60
|
|
|
57
61
|
# Run migrations to create the schema (only needed once)
|
|
@@ -61,19 +65,27 @@ FactDb::Database.migrate!
|
|
|
61
65
|
facts = FactDb.new
|
|
62
66
|
```
|
|
63
67
|
|
|
68
|
+
Configuration uses nested sections. You can also use environment variables:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
export FDB_DATABASE__URL="postgresql://localhost/fact_db"
|
|
72
|
+
export FDB_LLM__PROVIDER="openai"
|
|
73
|
+
export FDB_LLM__API_KEY="sk-..."
|
|
74
|
+
```
|
|
75
|
+
|
|
64
76
|
Once configured, you can ingest content and create facts:
|
|
65
77
|
|
|
66
78
|
```ruby
|
|
67
79
|
# Ingest content
|
|
68
80
|
content = facts.ingest(
|
|
69
81
|
"Paula Chen joined Microsoft as Principal Engineer on January 10, 2024.",
|
|
70
|
-
|
|
82
|
+
kind: :email,
|
|
71
83
|
captured_at: Time.now
|
|
72
84
|
)
|
|
73
85
|
|
|
74
86
|
# Create entities
|
|
75
|
-
paula = facts.entity_service.create("Paula Chen",
|
|
76
|
-
microsoft = facts.entity_service.create("Microsoft",
|
|
87
|
+
paula = facts.entity_service.create("Paula Chen", kind: :person)
|
|
88
|
+
microsoft = facts.entity_service.create("Microsoft", kind: :organization)
|
|
77
89
|
|
|
78
90
|
# Create a fact with entity mentions
|
|
79
91
|
facts.fact_service.create(
|
|
@@ -91,17 +103,106 @@ Query facts temporally:
|
|
|
91
103
|
```ruby
|
|
92
104
|
# Query current facts about Paula
|
|
93
105
|
facts.current_facts_for(paula.id).each do |fact|
|
|
94
|
-
puts fact.
|
|
106
|
+
puts fact.text
|
|
95
107
|
end
|
|
96
108
|
|
|
97
109
|
# Query facts at a point in time (before she joined)
|
|
98
110
|
facts.facts_at(Date.new(2023, 6, 15), entity: paula.id)
|
|
99
111
|
```
|
|
100
112
|
|
|
113
|
+
## Output Formats
|
|
114
|
+
|
|
115
|
+
Query results can be transformed into multiple formats for different use cases:
|
|
116
|
+
|
|
117
|
+
```ruby
|
|
118
|
+
# Raw - original ActiveRecord objects for direct database access
|
|
119
|
+
results = facts.query_facts(topic: "Paula Chen", format: :raw)
|
|
120
|
+
results.each do |fact|
|
|
121
|
+
puts fact.text
|
|
122
|
+
puts fact.entity_mentions.map(&:entity).map(&:name)
|
|
123
|
+
end
|
|
124
|
+
|
|
125
|
+
# JSON (default) - structured hash
|
|
126
|
+
facts.query_facts(topic: "Paula Chen", format: :json)
|
|
127
|
+
|
|
128
|
+
# Triples - Subject-Predicate-Object for semantic encoding
|
|
129
|
+
facts.query_facts(topic: "Paula Chen", format: :triples)
|
|
130
|
+
# => [["Paula Chen", "kind", "Person"],
|
|
131
|
+
# ["Paula Chen", "works_at", "Microsoft"],
|
|
132
|
+
# ["Paula Chen", "works_at.valid_from", "2024-01-10"]]
|
|
133
|
+
|
|
134
|
+
# Cypher - graph notation with nodes and relationships
|
|
135
|
+
facts.query_facts(topic: "Paula Chen", format: :cypher)
|
|
136
|
+
# => (paula_chen:Person {name: "Paula Chen"})
|
|
137
|
+
# (microsoft:Organization {name: "Microsoft"})
|
|
138
|
+
# (paula_chen)-[:WORKS_AT {since: "2024-01-10"}]->(microsoft)
|
|
139
|
+
|
|
140
|
+
# Text - human-readable markdown
|
|
141
|
+
facts.query_facts(topic: "Paula Chen", format: :text)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## Temporal Query Builder
|
|
145
|
+
|
|
146
|
+
Use the fluent API for point-in-time queries:
|
|
147
|
+
|
|
148
|
+
```ruby
|
|
149
|
+
# Query at a specific date
|
|
150
|
+
facts.at("2024-01-15").query("Paula's role", format: :cypher)
|
|
151
|
+
|
|
152
|
+
# Get all facts valid at a date
|
|
153
|
+
facts.at("2024-01-15").facts
|
|
154
|
+
|
|
155
|
+
# Get facts for a specific entity at that date
|
|
156
|
+
facts.at("2024-01-15").facts_for(paula.id)
|
|
157
|
+
|
|
158
|
+
# Compare what changed between two dates
|
|
159
|
+
facts.at("2024-01-15").compare_to("2024-06-15")
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
## Comparing Changes Over Time
|
|
163
|
+
|
|
164
|
+
Track what changed between two points in time:
|
|
165
|
+
|
|
166
|
+
```ruby
|
|
167
|
+
diff = facts.diff("Paula Chen", from: "2024-01-01", to: "2024-06-01")
|
|
168
|
+
|
|
169
|
+
diff[:added] # Facts that became valid
|
|
170
|
+
diff[:removed] # Facts that were superseded
|
|
171
|
+
diff[:unchanged] # Facts that remained valid
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
## Introspection
|
|
175
|
+
|
|
176
|
+
Discover what the fact database knows about:
|
|
177
|
+
|
|
178
|
+
```ruby
|
|
179
|
+
# Get schema and capabilities
|
|
180
|
+
facts.introspect
|
|
181
|
+
# => { capabilities: [:temporal_query, :entity_resolution, ...],
|
|
182
|
+
# entity_kinds: ["person", "organization", ...],
|
|
183
|
+
# output_formats: [:raw, :json, :triples, :cypher, :text],
|
|
184
|
+
# statistics: { facts: {...}, entities: {...} } }
|
|
185
|
+
|
|
186
|
+
# Get coverage for a specific topic
|
|
187
|
+
facts.introspect("Paula Chen")
|
|
188
|
+
# => { entity: {...}, coverage: {...}, relationships: [...],
|
|
189
|
+
# suggested_queries: ["current status", "employment history"] }
|
|
190
|
+
|
|
191
|
+
# Get query suggestions
|
|
192
|
+
facts.suggest_queries("Paula Chen")
|
|
193
|
+
# => ["current status", "employment history", "timeline"]
|
|
194
|
+
|
|
195
|
+
# Get retrieval strategy recommendations
|
|
196
|
+
facts.suggest_strategies("What happened last week?")
|
|
197
|
+
# => [{ strategy: :temporal, description: "Filter by date range" }]
|
|
198
|
+
```
|
|
199
|
+
|
|
101
200
|
## Documentation
|
|
102
201
|
|
|
103
202
|
Full documentation is available at **[https://madbomber.github.io/fact_db](https://madbomber.github.io/fact_db)**
|
|
104
203
|
|
|
204
|
+
API documentation (YARD) is available at **[https://madbomber.github.io/fact_db/yard](https://madbomber.github.io/fact_db/yard)**
|
|
205
|
+
|
|
105
206
|
## Examples
|
|
106
207
|
|
|
107
208
|
See the [examples directory](examples/README.md) for runnable demo programs covering:
|
data/Rakefile
CHANGED
|
@@ -9,33 +9,266 @@ Rake::TestTask.new(:test) do |t|
|
|
|
9
9
|
t.test_files = FileList["test/**/*_test.rb"]
|
|
10
10
|
end
|
|
11
11
|
|
|
12
|
+
# Ensure test environment is set before running tests
|
|
13
|
+
task :set_test_env do
|
|
14
|
+
ENV["FDB_ENV"] = "test"
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
task test: [:set_test_env, "db:reset:test"]
|
|
18
|
+
|
|
12
19
|
namespace :db do
|
|
20
|
+
desc "Drop the database"
|
|
21
|
+
task :drop do
|
|
22
|
+
require_relative "lib/fact_db"
|
|
23
|
+
puts "Environment: #{FactDb.config.environment}"
|
|
24
|
+
puts "Database: #{FactDb.config.database.name}"
|
|
25
|
+
FactDb::Database.drop!
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
desc "Create the database"
|
|
29
|
+
task :create do
|
|
30
|
+
require_relative "lib/fact_db"
|
|
31
|
+
puts "Environment: #{FactDb.config.environment}"
|
|
32
|
+
puts "Database: #{FactDb.config.database.name}"
|
|
33
|
+
FactDb::Database.create!
|
|
34
|
+
end
|
|
35
|
+
|
|
13
36
|
desc "Run database migrations"
|
|
14
37
|
task :migrate do
|
|
15
38
|
require_relative "lib/fact_db"
|
|
16
|
-
FactDb.
|
|
17
|
-
|
|
18
|
-
end
|
|
39
|
+
puts "Environment: #{FactDb.config.environment}"
|
|
40
|
+
puts "Database: #{FactDb.config.database.name}"
|
|
19
41
|
FactDb::Database.migrate!
|
|
20
42
|
end
|
|
21
43
|
|
|
22
44
|
desc "Rollback the last migration"
|
|
23
45
|
task :rollback do
|
|
24
46
|
require_relative "lib/fact_db"
|
|
25
|
-
FactDb.
|
|
26
|
-
|
|
27
|
-
end
|
|
47
|
+
puts "Environment: #{FactDb.config.environment}"
|
|
48
|
+
puts "Database: #{FactDb.config.database.name}"
|
|
28
49
|
FactDb::Database.rollback!
|
|
29
50
|
end
|
|
30
51
|
|
|
31
|
-
desc "Reset the database (drop, create, migrate)"
|
|
52
|
+
desc "Reset the database (drop, create, migrate) - honors FDB_ENV"
|
|
32
53
|
task :reset do
|
|
33
54
|
require_relative "lib/fact_db"
|
|
34
|
-
FactDb.
|
|
35
|
-
|
|
36
|
-
end
|
|
55
|
+
puts "Environment: #{FactDb.config.environment}"
|
|
56
|
+
puts "Database: #{FactDb.config.database.name}"
|
|
37
57
|
FactDb::Database.reset!
|
|
38
58
|
end
|
|
59
|
+
|
|
60
|
+
namespace :reset do
|
|
61
|
+
def reset_for_environment(env_name)
|
|
62
|
+
original_env = ENV["FDB_ENV"]
|
|
63
|
+
ENV["FDB_ENV"] = env_name
|
|
64
|
+
|
|
65
|
+
require_relative "lib/fact_db"
|
|
66
|
+
Anyway::Settings.current_environment = env_name
|
|
67
|
+
FactDb.reset_configuration!
|
|
68
|
+
|
|
69
|
+
puts "Environment: #{FactDb.config.environment}"
|
|
70
|
+
puts "Database: #{FactDb.config.database.name}"
|
|
71
|
+
FactDb::Database.reset!
|
|
72
|
+
ensure
|
|
73
|
+
ENV["FDB_ENV"] = original_env
|
|
74
|
+
Anyway::Settings.current_environment = original_env || "development"
|
|
75
|
+
FactDb.reset_configuration!
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
desc "Reset development database"
|
|
79
|
+
task :development do
|
|
80
|
+
reset_for_environment("development")
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
desc "Reset test database"
|
|
84
|
+
task :test do
|
|
85
|
+
reset_for_environment("test")
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
desc "Reset demo database"
|
|
89
|
+
task :demo do
|
|
90
|
+
reset_for_environment("demo")
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
desc "Reset all databases (development, test, demo)"
|
|
94
|
+
task :all do
|
|
95
|
+
%w[development test demo].each do |env_name|
|
|
96
|
+
puts "\n#{"=" * 50}"
|
|
97
|
+
reset_for_environment(env_name)
|
|
98
|
+
end
|
|
99
|
+
puts "\n#{"=" * 50}"
|
|
100
|
+
puts "All databases reset."
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
namespace :schema do
|
|
105
|
+
desc "Dump database schema to db/schema.sql"
|
|
106
|
+
task :dump do
|
|
107
|
+
require_relative "lib/fact_db"
|
|
108
|
+
db = FactDb.config.database
|
|
109
|
+
|
|
110
|
+
puts "Environment: #{FactDb.config.environment}"
|
|
111
|
+
puts "Database: #{db.name}"
|
|
112
|
+
|
|
113
|
+
schema_file = File.expand_path("db/schema.sql", __dir__)
|
|
114
|
+
host = db.host || "localhost"
|
|
115
|
+
port = db.port || 5432
|
|
116
|
+
user = db.username || ENV["USER"]
|
|
117
|
+
|
|
118
|
+
cmd = ["pg_dump", "--schema-only", "--no-owner", "--no-acl"]
|
|
119
|
+
cmd += ["-h", host, "-p", port.to_s]
|
|
120
|
+
cmd += ["-U", user] if user
|
|
121
|
+
cmd << db.name
|
|
122
|
+
|
|
123
|
+
File.open(schema_file, "w") do |f|
|
|
124
|
+
f.puts "-- Schema dump for #{db.name}"
|
|
125
|
+
f.puts "-- Generated: #{Time.now.utc.iso8601}"
|
|
126
|
+
f.puts "-- Environment: #{FactDb.config.environment}"
|
|
127
|
+
f.puts
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
system(*cmd, out: [schema_file, "a"]) || abort("pg_dump failed")
|
|
131
|
+
puts "Schema dumped to #{schema_file}"
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
desc "Load database schema from db/schema.sql"
|
|
135
|
+
task :load do
|
|
136
|
+
require_relative "lib/fact_db"
|
|
137
|
+
db = FactDb.config.database
|
|
138
|
+
|
|
139
|
+
puts "Environment: #{FactDb.config.environment}"
|
|
140
|
+
puts "Database: #{db.name}"
|
|
141
|
+
|
|
142
|
+
schema_file = File.expand_path("db/schema.sql", __dir__)
|
|
143
|
+
abort("Schema file not found: #{schema_file}") unless File.exist?(schema_file)
|
|
144
|
+
|
|
145
|
+
host = db.host || "localhost"
|
|
146
|
+
port = db.port || 5432
|
|
147
|
+
user = db.username || ENV["USER"]
|
|
148
|
+
|
|
149
|
+
cmd = ["psql", "-q"]
|
|
150
|
+
cmd += ["-h", host, "-p", port.to_s]
|
|
151
|
+
cmd += ["-U", user] if user
|
|
152
|
+
cmd += ["-d", db.name, "-f", schema_file]
|
|
153
|
+
|
|
154
|
+
system(*cmd) || abort("psql failed")
|
|
155
|
+
puts "Schema loaded from #{schema_file}"
|
|
156
|
+
end
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
desc "Dump database to file. DIR=path (default: .)"
|
|
160
|
+
task :dump do
|
|
161
|
+
require_relative "lib/fact_db"
|
|
162
|
+
db = FactDb.config.database
|
|
163
|
+
|
|
164
|
+
puts "Environment: #{FactDb.config.environment}"
|
|
165
|
+
puts "Database: #{db.name}"
|
|
166
|
+
|
|
167
|
+
dumps_dir = ENV["DIR"] || Dir.pwd
|
|
168
|
+
timestamp = Time.now.strftime("%Y%m%d_%H%M%S")
|
|
169
|
+
dump_file = File.join(dumps_dir, "fact_db_#{FactDb.config.environment}_#{timestamp}.dump")
|
|
170
|
+
|
|
171
|
+
host = db.host || "localhost"
|
|
172
|
+
port = db.port || 5432
|
|
173
|
+
user = db.username || ENV["USER"]
|
|
174
|
+
|
|
175
|
+
cmd = ["pg_dump", "-Fc", "--no-owner", "--no-acl"]
|
|
176
|
+
cmd += ["-h", host, "-p", port.to_s]
|
|
177
|
+
cmd += ["-U", user] if user
|
|
178
|
+
cmd += ["-f", dump_file]
|
|
179
|
+
cmd << db.name
|
|
180
|
+
|
|
181
|
+
system(*cmd) || abort("pg_dump failed")
|
|
182
|
+
puts "Database dumped to #{dump_file}"
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
desc "Restore database from dump file. DIR=path (default: .)"
|
|
186
|
+
task :restore do
|
|
187
|
+
require_relative "lib/fact_db"
|
|
188
|
+
db = FactDb.config.database
|
|
189
|
+
|
|
190
|
+
dumps_dir = ENV["DIR"] || Dir.pwd
|
|
191
|
+
pattern = File.join(dumps_dir, "fact_db_#{FactDb.config.environment}_*.dump")
|
|
192
|
+
dump_files = Dir.glob(pattern).sort
|
|
193
|
+
abort("No dump files found matching: #{pattern}") if dump_files.empty?
|
|
194
|
+
|
|
195
|
+
dump_file = if dump_files.size == 1
|
|
196
|
+
dump_files.first
|
|
197
|
+
else
|
|
198
|
+
puts "Available dump files:"
|
|
199
|
+
dump_files.each_with_index do |file, index|
|
|
200
|
+
puts " #{index + 1}. #{File.basename(file)}"
|
|
201
|
+
end
|
|
202
|
+
print "\nSelect file (1-#{dump_files.size}): "
|
|
203
|
+
choice = $stdin.gets.to_i
|
|
204
|
+
abort("Invalid selection") unless choice.between?(1, dump_files.size)
|
|
205
|
+
dump_files[choice - 1]
|
|
206
|
+
end
|
|
207
|
+
|
|
208
|
+
puts "Environment: #{FactDb.config.environment}"
|
|
209
|
+
puts "Database: #{db.name}"
|
|
210
|
+
puts "Restoring from: #{dump_file}"
|
|
211
|
+
|
|
212
|
+
host = db.host || "localhost"
|
|
213
|
+
port = db.port || 5432
|
|
214
|
+
user = db.username || ENV["USER"]
|
|
215
|
+
|
|
216
|
+
cmd = ["pg_restore", "--no-owner", "--no-acl", "-d", db.name]
|
|
217
|
+
cmd += ["-h", host, "-p", port.to_s]
|
|
218
|
+
cmd += ["-U", user] if user
|
|
219
|
+
cmd << dump_file
|
|
220
|
+
|
|
221
|
+
system(*cmd) || abort("pg_restore failed")
|
|
222
|
+
puts "Database restored from #{dump_file}"
|
|
223
|
+
end
|
|
224
|
+
|
|
225
|
+
desc "Clean up invalid aliases (pronouns, generic terms). Use EXECUTE=1 to apply changes."
|
|
226
|
+
task :cleanup_aliases do
|
|
227
|
+
require_relative "lib/fact_db"
|
|
228
|
+
puts "Environment: #{FactDb.config.environment}"
|
|
229
|
+
puts "Database: #{FactDb.config.database.name}"
|
|
230
|
+
FactDb::Database.establish_connection!
|
|
231
|
+
|
|
232
|
+
dry_run = ENV["EXECUTE"] != "1"
|
|
233
|
+
stats = { checked: 0, removed: 0 }
|
|
234
|
+
|
|
235
|
+
puts dry_run ? "\n=== DRY RUN ===" : "\n=== EXECUTING ==="
|
|
236
|
+
puts
|
|
237
|
+
|
|
238
|
+
FactDb::Models::Entity.not_merged.find_each do |entity|
|
|
239
|
+
entity.aliases.each do |alias_record|
|
|
240
|
+
stats[:checked] += 1
|
|
241
|
+
next if FactDb::Validation::AliasFilter.valid?(alias_record.name, name: entity.name)
|
|
242
|
+
|
|
243
|
+
reason = FactDb::Validation::AliasFilter.rejection_reason(alias_record.name, name: entity.name)
|
|
244
|
+
puts "#{entity.name}: removing \"#{alias_record.name}\" (#{reason})"
|
|
245
|
+
alias_record.destroy unless dry_run
|
|
246
|
+
stats[:removed] += 1
|
|
247
|
+
end
|
|
248
|
+
end
|
|
249
|
+
|
|
250
|
+
puts "\nChecked: #{stats[:checked]}, Removed: #{stats[:removed]}"
|
|
251
|
+
puts "\nRun with EXECUTE=1 to apply changes." if dry_run && stats[:removed] > 0
|
|
252
|
+
end
|
|
253
|
+
end
|
|
254
|
+
|
|
255
|
+
namespace :docs do
|
|
256
|
+
desc "Build mkdocs documentation site"
|
|
257
|
+
task :mkdocs do
|
|
258
|
+
output_dir = File.expand_path("site", __dir__)
|
|
259
|
+
system("mkdocs", "build", "--clean") || abort("mkdocs build failed")
|
|
260
|
+
puts "MkDocs site built to #{output_dir}"
|
|
261
|
+
end
|
|
262
|
+
|
|
263
|
+
desc "Build YARD API documentation"
|
|
264
|
+
task :yard do
|
|
265
|
+
output_dir = File.expand_path("doc", __dir__)
|
|
266
|
+
system("yard", "doc") || abort("yard doc failed")
|
|
267
|
+
puts "YARD documentation built to #{output_dir}"
|
|
268
|
+
end
|
|
269
|
+
|
|
270
|
+
desc "Build all documentation (mkdocs and YARD)"
|
|
271
|
+
task all: [:mkdocs, :yard]
|
|
39
272
|
end
|
|
40
273
|
|
|
41
274
|
task default: :test
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
class CreateSources < ActiveRecord::Migration[7.0]
|
|
4
|
+
def change
|
|
5
|
+
create_table :fact_db_sources, comment: "Stores immutable source content from which facts are extracted" do |t|
|
|
6
|
+
t.string :content_hash, null: false, limit: 64,
|
|
7
|
+
comment: "SHA-256 hash of content for deduplication and integrity verification"
|
|
8
|
+
t.string :kind, null: false, limit: 50,
|
|
9
|
+
comment: "Classification of content origin (e.g., email, document, webpage, transcript)"
|
|
10
|
+
|
|
11
|
+
t.text :content, null: false,
|
|
12
|
+
comment: "Original unmodified text content, preserved for audit and re-extraction"
|
|
13
|
+
t.string :title, limit: 500,
|
|
14
|
+
comment: "Human-readable title or subject line of the content"
|
|
15
|
+
|
|
16
|
+
t.text :source_uri,
|
|
17
|
+
comment: "URI identifying the original source location (URL, file path, message ID)"
|
|
18
|
+
t.jsonb :metadata, null: false, default: {},
|
|
19
|
+
comment: "Flexible metadata about the source (author, date, headers, etc.)"
|
|
20
|
+
|
|
21
|
+
t.vector :embedding, limit: 1536,
|
|
22
|
+
comment: "Vector embedding for semantic similarity search (OpenAI ada-002 compatible)"
|
|
23
|
+
|
|
24
|
+
t.timestamptz :captured_at, null: false,
|
|
25
|
+
comment: "When the content was originally captured or received"
|
|
26
|
+
t.timestamps
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
add_index :fact_db_sources, :content_hash, unique: true
|
|
30
|
+
add_index :fact_db_sources, :captured_at
|
|
31
|
+
add_index :fact_db_sources, :kind
|
|
32
|
+
add_index :fact_db_sources, :metadata, using: :gin
|
|
33
|
+
|
|
34
|
+
# Full-text search index
|
|
35
|
+
execute <<-SQL
|
|
36
|
+
CREATE INDEX idx_sources_fulltext ON fact_db_sources
|
|
37
|
+
USING gin(to_tsvector('english', content));
|
|
38
|
+
SQL
|
|
39
|
+
|
|
40
|
+
# HNSW index for vector similarity search
|
|
41
|
+
execute <<-SQL
|
|
42
|
+
CREATE INDEX idx_sources_embedding ON fact_db_sources
|
|
43
|
+
USING hnsw (embedding vector_cosine_ops);
|
|
44
|
+
SQL
|
|
45
|
+
|
|
46
|
+
execute "COMMENT ON COLUMN fact_db_sources.created_at IS 'When this record was created in the database';"
|
|
47
|
+
execute "COMMENT ON COLUMN fact_db_sources.updated_at IS 'When this record was last modified';"
|
|
48
|
+
end
|
|
49
|
+
end
|
|
@@ -2,35 +2,47 @@
|
|
|
2
2
|
|
|
3
3
|
class CreateEntities < ActiveRecord::Migration[7.0]
|
|
4
4
|
def change
|
|
5
|
-
create_table :fact_db_entities do |t|
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
t.string :
|
|
5
|
+
create_table :fact_db_entities, comment: "Canonical representations of people, organizations, places, and other named entities" do |t|
|
|
6
|
+
t.string :name, null: false, limit: 500,
|
|
7
|
+
comment: "Authoritative name for this entity after resolution and normalization"
|
|
8
|
+
t.string :kind, null: false, limit: 50,
|
|
9
|
+
comment: "Classification of entity (person, organization, location, product, event, etc.)"
|
|
9
10
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
t.bigint :
|
|
11
|
+
t.string :resolution_status, null: false, default: "unresolved", limit: 20,
|
|
12
|
+
comment: "Entity resolution state: unresolved, resolved, merged, or ambiguous"
|
|
13
|
+
t.bigint :canonical_id,
|
|
14
|
+
comment: "Reference to canonical entity if this entity was merged as a duplicate"
|
|
13
15
|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
t.jsonb :metadata, null: false, default: {}
|
|
16
|
+
t.text :description,
|
|
17
|
+
comment: "Human-readable description providing context about this entity"
|
|
18
|
+
t.jsonb :metadata, null: false, default: {},
|
|
19
|
+
comment: "Flexible attributes specific to entity type (titles, roles, identifiers, etc.)"
|
|
17
20
|
|
|
18
|
-
|
|
19
|
-
|
|
21
|
+
t.vector :embedding, limit: 1536,
|
|
22
|
+
comment: "Vector embedding for semantic entity matching and similarity search"
|
|
20
23
|
|
|
21
24
|
t.timestamps
|
|
22
25
|
end
|
|
23
26
|
|
|
24
|
-
add_index :fact_db_entities, :
|
|
25
|
-
add_index :fact_db_entities, :
|
|
27
|
+
add_index :fact_db_entities, :name
|
|
28
|
+
add_index :fact_db_entities, :kind
|
|
26
29
|
add_index :fact_db_entities, :resolution_status
|
|
27
30
|
add_foreign_key :fact_db_entities, :fact_db_entities,
|
|
28
|
-
column: :
|
|
31
|
+
column: :canonical_id, on_delete: :nullify
|
|
29
32
|
|
|
30
33
|
# HNSW index for vector similarity search
|
|
31
34
|
execute <<-SQL
|
|
32
35
|
CREATE INDEX idx_entities_embedding ON fact_db_entities
|
|
33
36
|
USING hnsw (embedding vector_cosine_ops);
|
|
34
37
|
SQL
|
|
38
|
+
|
|
39
|
+
# GIN trigram index on name for fast fuzzy matching
|
|
40
|
+
execute <<-SQL
|
|
41
|
+
CREATE INDEX idx_entities_name_trgm ON fact_db_entities
|
|
42
|
+
USING gin (name gin_trgm_ops);
|
|
43
|
+
SQL
|
|
44
|
+
|
|
45
|
+
execute "COMMENT ON COLUMN fact_db_entities.created_at IS 'When this entity was first identified';"
|
|
46
|
+
execute "COMMENT ON COLUMN fact_db_entities.updated_at IS 'When this entity record was last modified';"
|
|
35
47
|
end
|
|
36
48
|
end
|