fact_db 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.envrc +2 -0
- data/.yardopts +5 -0
- data/CHANGELOG.md +64 -0
- data/README.md +107 -6
- data/Rakefile +243 -10
- data/db/migrate/001_enable_extensions.rb +1 -0
- data/db/migrate/002_create_sources.rb +49 -0
- data/db/migrate/003_create_entities.rb +27 -15
- data/db/migrate/004_create_entity_aliases.rb +20 -7
- data/db/migrate/005_create_facts.rb +37 -21
- data/db/migrate/006_create_entity_mentions.rb +14 -6
- data/db/migrate/007_create_fact_sources.rb +16 -8
- data/docs/api/extractors/index.md +5 -5
- data/docs/api/extractors/llm.md +17 -17
- data/docs/api/extractors/rule-based.md +14 -14
- data/docs/api/facts.md +20 -20
- data/docs/api/index.md +4 -4
- data/docs/api/models/entity.md +21 -21
- data/docs/api/models/fact.md +15 -15
- data/docs/api/models/index.md +7 -7
- data/docs/api/models/{content.md → source.md} +29 -29
- data/docs/api/pipeline/extraction.md +25 -25
- data/docs/api/pipeline/index.md +1 -1
- data/docs/api/pipeline/resolution.md +4 -4
- data/docs/api/services/entity-service.md +20 -20
- data/docs/api/services/fact-service.md +12 -12
- data/docs/api/services/index.md +5 -5
- data/docs/api/services/{content-service.md → source-service.md} +27 -27
- data/docs/architecture/database-schema.md +46 -46
- data/docs/architecture/entity-resolution.md +6 -6
- data/docs/architecture/index.md +10 -10
- data/docs/architecture/temporal-facts.md +5 -5
- data/docs/architecture/three-layer-model.md +17 -17
- data/docs/concepts.md +6 -6
- data/docs/examples/basic-usage.md +20 -20
- data/docs/examples/hr-onboarding.md +17 -17
- data/docs/examples/index.md +4 -4
- data/docs/examples/news-analysis.md +23 -23
- data/docs/getting-started/database-setup.md +28 -20
- data/docs/getting-started/index.md +3 -3
- data/docs/getting-started/quick-start.md +33 -30
- data/docs/guides/batch-processing.md +26 -26
- data/docs/guides/configuration.md +158 -77
- data/docs/guides/entity-management.md +14 -14
- data/docs/guides/extracting-facts.md +28 -28
- data/docs/guides/ingesting-content.md +14 -14
- data/docs/guides/llm-integration.md +40 -32
- data/docs/guides/temporal-queries.md +11 -11
- data/docs/index.md +6 -2
- data/examples/.envrc +4 -0
- data/examples/.gitignore +1 -0
- data/examples/001_configuration.rb +312 -0
- data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
- data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
- data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
- data/examples/040_output_formats.rb +177 -0
- data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
- data/examples/060_fluent_temporal_api.rb +217 -0
- data/examples/070_introspection.rb +252 -0
- data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
- data/examples/090_ingest_demo.rb +515 -0
- data/examples/100_query_context.rb +668 -0
- data/examples/110_prove_it.rb +204 -0
- data/examples/120_dump_database.rb +358 -0
- data/examples/130_rag_feedback_loop.rb +858 -0
- data/examples/README.md +229 -15
- data/examples/data/lincoln_associates.md +201 -0
- data/examples/data/lincoln_biography.md +66 -0
- data/examples/data/lincoln_cabinet.md +243 -0
- data/examples/data/lincoln_family.md +163 -0
- data/examples/data/lincoln_military.md +241 -0
- data/examples/data/lincoln_todd_family.md +136 -0
- data/examples/ingest_reporter.rb +335 -0
- data/examples/utilities.rb +182 -0
- data/lib/fact_db/config/defaults.yml +254 -0
- data/lib/fact_db/config.rb +94 -35
- data/lib/fact_db/database.rb +98 -8
- data/lib/fact_db/extractors/base.rb +106 -21
- data/lib/fact_db/extractors/llm_extractor.rb +35 -63
- data/lib/fact_db/extractors/manual_extractor.rb +46 -6
- data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
- data/lib/fact_db/llm/adapter.rb +3 -3
- data/lib/fact_db/models/entity.rb +94 -22
- data/lib/fact_db/models/entity_alias.rb +41 -7
- data/lib/fact_db/models/entity_mention.rb +34 -1
- data/lib/fact_db/models/fact.rb +259 -28
- data/lib/fact_db/models/fact_source.rb +43 -9
- data/lib/fact_db/models/source.rb +113 -0
- data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
- data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
- data/lib/fact_db/query_result.rb +202 -0
- data/lib/fact_db/resolution/entity_resolver.rb +139 -39
- data/lib/fact_db/resolution/fact_resolver.rb +86 -14
- data/lib/fact_db/services/entity_service.rb +246 -37
- data/lib/fact_db/services/fact_service.rb +254 -17
- data/lib/fact_db/services/source_service.rb +164 -0
- data/lib/fact_db/temporal/query.rb +71 -7
- data/lib/fact_db/temporal/query_builder.rb +69 -0
- data/lib/fact_db/temporal/timeline.rb +102 -11
- data/lib/fact_db/transformers/base.rb +77 -0
- data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
- data/lib/fact_db/transformers/json_transformer.rb +17 -0
- data/lib/fact_db/transformers/raw_transformer.rb +35 -0
- data/lib/fact_db/transformers/text_transformer.rb +114 -0
- data/lib/fact_db/transformers/triple_transformer.rb +138 -0
- data/lib/fact_db/validation/alias_filter.rb +185 -0
- data/lib/fact_db/version.rb +1 -1
- data/lib/fact_db.rb +281 -30
- data/mkdocs.yml +2 -2
- metadata +60 -16
- data/db/migrate/002_create_contents.rb +0 -44
- data/lib/fact_db/models/content.rb +0 -62
- data/lib/fact_db/services/content_service.rb +0 -93
|
@@ -0,0 +1,515 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Ingest Demo for FactDb
|
|
5
|
+
#
|
|
6
|
+
# This example demonstrates building a fact database from a directory of markdown files
|
|
7
|
+
# using automatic entity and fact extraction:
|
|
8
|
+
# - Parsing markdown files with optional YAML frontmatter
|
|
9
|
+
# - Using LLM-based extraction to identify entities and facts
|
|
10
|
+
# - Automatic entity resolution and deduplication
|
|
11
|
+
# - Progressive entity discovery from text
|
|
12
|
+
#
|
|
13
|
+
# Usage:
|
|
14
|
+
#   ruby 090_ingest_demo.rb <directory>        # Build/update database from directory
|
|
15
|
+
#   ruby 090_ingest_demo.rb <file.md>          # Process a single markdown file
|
|
16
|
+
#   ruby 090_ingest_demo.rb <path> --rebuild   # Drop and rebuild from scratch
|
|
17
|
+
#   ruby 090_ingest_demo.rb --stats            # Show statistics only
|
|
18
|
+
|
|
19
|
+
require_relative "utilities"
|
|
20
|
+
require_relative "ingest_reporter"
|
|
21
|
+
require "yaml"
|
|
22
|
+
require "debug_me"
|
|
23
|
+
include DebugMe
|
|
24
|
+
require "amazing_print"
|
|
25
|
+
|
|
26
|
+
# Note: CLI tool - uses cli_setup! which does NOT reset database
|
|
27
|
+
# Use --rebuild flag to explicitly reset
|
|
28
|
+
|
|
29
|
+
class IngestDemo
|
|
30
|
+
# Prepares a demo run for a single markdown file or a directory of them.
#
# @param path [String] markdown file or directory to ingest
# @param rebuild [Boolean] when true, #run clears all data before ingesting
# @param count [Integer, nil] cap on the number of unprocessed files to ingest
# @param reporter [Object, nil] progress reporter; an IngestReporter is used when nil
def initialize(path:, rebuild: false, count: nil, reporter: nil)
  @path = path
  @is_file = File.file?(path)
  # For a single file, the "directory" is the folder that contains it.
  @directory = if @is_file
    File.dirname(path)
  else
    path
  end
  @rebuild = rebuild
  @count = count
  @reporter = reporter || IngestReporter.new

  setup_factdb
end
|
|
39
|
+
|
|
40
|
+
# Validates the configured path, optionally wipes the database, then ingests
# the markdown file(s) and prints statistics and sample queries.
# Exits the process with status 1 when the path is missing or is a non-.md file.
def run
  rule = "=" * 60

  if !File.exist?(@path)
    puts "Error: Path not found: #{@path}"
    exit 1
  elsif @is_file && !@path.end_with?(".md")
    puts "Error: File must be a markdown (.md) file: #{@path}"
    exit 1
  end

  if @rebuild
    puts "Rebuilding database from scratch..."
    clear_all_data
  end

  source_line = @is_file ? "Source file: #{@path}" : "Source directory: #{@directory}"
  extractor_name = @extractor.class.name.split('::').last

  puts rule, "Document Ingest Demo - FactDb (Automatic Extraction)", rule
  puts source_line
  puts "Extractor: #{extractor_name}"
  puts

  process_markdown_files
  show_statistics
  demonstrate_queries

  puts "\n#{rule}"
  puts "Document Ingest Complete!", rule
end
|
|
71
|
+
|
|
72
|
+
# Prints a banner followed by the database statistics, without ingesting anything.
def show_statistics_only
  divider = "=" * 60
  puts divider, "Database Statistics", divider

  show_statistics
end
|
|
79
|
+
|
|
80
|
+
private
|
|
81
|
+
|
|
82
|
+
# Configures FactDb for the demo, runs migrations, and caches the service
# objects and the LLM extractor in instance variables.
def setup_factdb
  DemoUtilities.ensure_demo_environment!
  DemoUtilities.require_fact_db!

  # The log file sits next to this script and is truncated on every run ('w').
  script_name = File.basename(__FILE__, '.rb')
  log_path = File.join(__dir__, "#{script_name}.log")

  FactDb.configure do |config|
    config.default_extractor = :llm
    config.logger = Logger.new(File.open(log_path, 'w'))

    # Provider comes from FACT_DB_LLM_PROVIDER and defaults to "anthropic".
    llm_provider = ENV.fetch("FACT_DB_LLM_PROVIDER", "anthropic").to_sym
    config.llm_client = FactDb::LLM::Adapter.new(provider: llm_provider)
  end

  FactDb::Database.migrate!

  @facts = FactDb.new
  @entity_service = @facts.entity_service
  @fact_service = @facts.fact_service
  @source_service = @facts.source_service
  @extractor = FactDb::Extractors::Base.for(:llm)
end
|
|
107
|
+
|
|
108
|
+
# Deletes every row from all FactDb model tables.
def clear_all_data
  puts "Clearing all data from database..."

  # Order matters: dependent tables are emptied before the tables they reference.
  %i[FactSource EntityMention Fact EntityAlias Entity Source].each do |model_name|
    FactDb::Models.const_get(model_name).delete_all
  end

  puts " All data cleared"
end
|
|
121
|
+
|
|
122
|
+
# Removes the source records (and their fact-source links) whose metadata
# names the current directory as `source_directory`.
def clear_directory_data
  puts "Clearing existing data from this directory..."

  scope = FactDb::Models::Source.where(
    "metadata->>'source_directory' = ?",
    File.basename(@directory)
  )
  ids = scope.pluck(:id)

  unless ids.empty?
    # Links go first so no fact-source row points at a deleted source.
    FactDb::Models::FactSource.where(source_id: ids).delete_all
    scope.delete_all
    puts " Removed #{ids.count} source records"
  end

  puts " Data cleared"
end
|
|
138
|
+
|
|
139
|
+
# Reports whether a source record already exists for the given markdown file.
#
# Sources are created with `frontmatter["title"] || filename` as their title,
# so a title comparison alone misses every file that declares a frontmatter
# title. The `source_file` metadata key always holds the file's basename, so
# it is checked first; the title check is kept for records that lack it.
#
# @param file_path [String] path to a markdown file
# @return [Boolean]
def file_already_processed?(file_path)
  filename = File.basename(file_path, ".md")
  sources = FactDb::Models::Source

  sources.where("metadata->>'source_file' = ?", filename).exists? ||
    sources.exists?(title: filename)
end
|
|
143
|
+
|
|
144
|
+
# Collects the candidate markdown files, drops those already ingested, applies
# the optional --count cap, and processes what remains while reporting progress.
def process_markdown_files
  candidates = @is_file ? [@path] : Dir.glob(File.join(@directory, "*.md")).sort

  # Split into files still to do and files that already have a source record.
  pending, done = candidates.partition { |candidate| !file_already_processed?(candidate) }
  batch = @count ? pending.first(@count) : pending

  @reporter.start_ingestion(
    total_files: batch.count,
    source_path: @is_file ? @path : @directory
  )
  @reporter.report_already_processed(done.count)

  if batch.empty?
    @reporter.no_files_to_process
    return
  end

  batch.each.with_index(1) do |file, position|
    process_markdown_file(file, position, batch.count)
  end

  @reporter.finish_ingestion
end
|
|
174
|
+
|
|
175
|
+
# Ingests one markdown file: reads it, records a source for it, splits the body
# into sections, and runs fact extraction over those sections.
#
# @param file_path [String] path to the markdown file
# @param file_index [Integer] 1-based position of this file in the batch
# @param total_files [Integer] size of the batch
def process_markdown_file(file_path, file_index, total_files)
  name = File.basename(file_path, ".md")
  raw = File.read(file_path)

  @reporter.file_started(name, file_index, total_files)

  frontmatter, body = parse_frontmatter(raw)

  # The source record stores the complete file text, frontmatter included.
  source = find_or_create_source(name, raw, frontmatter)
  totals = process_sections_with_extraction(name, parse_sections(body), source)

  @reporter.file_completed(**totals)
end
|
|
195
|
+
|
|
196
|
+
# Splits optional YAML frontmatter from a markdown document.
#
# Frontmatter is recognised only when the document starts with a `---` line
# and a second `---` line closes it. Delimiters must occupy a whole line, so a
# `---` inside a YAML value or a `----` rule does not split the document.
#
# @param content [String] full text of the markdown file
# @return [Array(Hash, String)] frontmatter (always a Hash, `{}` when absent,
#   empty, non-mapping, or unparseable) and the remaining body text
def parse_frontmatter(content)
  return [{}, content] unless content.start_with?("---")

  leading, yaml_text, body = content.split(/^---[ \t\r]*$/, 3)
  # `body` is nil when the closing delimiter is missing; `leading` is
  # non-empty when the first line was not a bare `---`.
  return [{}, content] unless body && leading.empty?

  [load_frontmatter(yaml_text), body]
end

# Parses frontmatter YAML defensively.
#
# @param yaml_text [String] text between the two `---` delimiters
# @return [Hash] parsed mapping, or `{}` if the YAML is invalid, uses a
#   disallowed type, or does not describe a mapping (e.g. empty or a scalar)
def load_frontmatter(yaml_text)
  # Dates are common in frontmatter; without permitting them a single date
  # value would discard the entire frontmatter.
  data = YAML.safe_load(yaml_text, permitted_classes: [Date, Time])
  data.is_a?(Hash) ? data : {}
rescue Psych::Exception
  {}
end
|
|
207
|
+
|
|
208
|
+
# Returns the source record titled after this document, creating it if needed.
# The title is the frontmatter "title" when present, otherwise the filename.
#
# @param filename [String] basename of the markdown file, without extension
# @param content_text [String] full file text stored on the new source
# @param frontmatter [Hash] parsed frontmatter, merged into the source metadata
# @return [Object] the existing or newly created source record
def find_or_create_source(filename, content_text, frontmatter)
  title = frontmatter["title"] || filename

  FactDb::Models::Source.find_by(title: title) ||
    @source_service.create(
      content_text,
      kind: :document,
      title: title,
      # Record where the document came from alongside its own frontmatter.
      metadata: frontmatter.merge(
        source_directory: File.basename(@directory),
        source_file: filename
      )
    )
end
|
|
224
|
+
|
|
225
|
+
# Splits a markdown body into sections delimited by `#` headers.
#
# Each section is a Hash with :heading (nil for text before the first header),
# :text (the section's non-blank lines, stripped and joined by single spaces;
# bare `---` lines are ignored), :start_line and :end_line (1-based, relative
# to `body`). Sections without any text are dropped.
#
# @param body [String] markdown text without frontmatter
# @return [Array<Hash>]
def parse_sections(body)
  completed = []
  heading = nil
  fragments = []
  first_line = 1
  line_no = 0

  body.each_line do |raw_line|
    line_no += 1
    stripped = raw_line.strip
    header = stripped.match(/^(#+)\s+(.+)$/)

    if header
      # A header closes the running section on the line just before it.
      unless fragments.empty?
        completed << {
          heading: heading,
          text: fragments.join(" "),
          start_line: first_line,
          end_line: line_no - 1
        }
      end
      heading = header[2].strip
      fragments = []
      first_line = line_no
    elsif !(stripped.empty? || stripped == "---")
      fragments << stripped
    end
  end

  # The last section runs to the final line of the body.
  unless fragments.empty?
    completed << {
      heading: heading,
      text: fragments.join(" "),
      start_line: first_line,
      end_line: line_no
    }
  end

  completed
end
|
|
257
|
+
|
|
258
|
+
# Runs LLM extraction over each section of a document and stores the results.
#
# Sections whose cleaned text is shorter than 10 characters are ignored.
# Sections that already have a fact tagged with their `section_ref` are
# skipped. All records for one section are written inside a single database
# transaction: without it, a failure partway through would leave some facts
# behind, and the "already has a fact" check would then skip the section on
# every later run, so the missing facts could never be extracted.
#
# @param filename [String] basename of the source file (used in metadata)
# @param sections [Array<Hash>] hashes with :heading, :text, :start_line, :end_line
# @param source [Object] source record each created fact is linked to
# @return [Hash] counters :facts, :entities, :skipped, :errors
def process_sections_with_extraction(filename, sections, source)
  stats = { facts: 0, entities: 0, skipped: 0, errors: 0 }
  total_sections = sections.count

  sections.each_with_index do |section, index|
    section_text = clean_text(section[:text])
    next if section_text.length < 10

    section_ref = section[:heading] || "Section #{index + 1}"
    @reporter.section_started(section_ref, index + 1, total_sections)

    fact_identifier = "#{filename}: #{section_ref}"
    if FactDb::Models::Fact.where("metadata->>'section_ref' = ?", fact_identifier).exists?
      stats[:skipped] += 1
      @reporter.section_skipped(section_ref)
      @reporter.section_completed
      next
    end

    begin
      # The (slow) LLM call stays outside the transaction.
      extracted_facts = extract_with_progress(section_text)

      section_facts = 0
      section_entities = 0

      FactDb::Models::Fact.transaction do
        extracted_facts.each do |fact_data|
          mentions = build_mentions(fact_data, filename)
          store_fact(fact_data, mentions, section, filename, fact_identifier, source)
          section_entities += mentions.size
          section_facts += 1
        end
      end

      @reporter.extraction_completed(facts_count: section_facts, entities_count: section_entities)
      stats[:facts] += section_facts
      stats[:entities] += section_entities
    rescue StandardError => e
      debug_me { [:section_ref, :e] }
      @reporter.error_occurred(e, context: section_ref)
      stats[:errors] += 1
    end

    @reporter.section_completed
  end

  stats
end

# Resolves (or creates) an entity for every mention of an extracted fact.
#
# @param fact_data [Hash] extracted fact; :mentions may be nil
# @param filename [String] used in the description given to new entities
# @return [Array<Hash>] one { entity_id:, role:, text: } hash per mention
def build_mentions(fact_data, filename)
  (fact_data[:mentions] || []).map do |mention_data|
    aliases = mention_data[:aliases] || []
    entity = @entity_service.resolve_or_create(
      mention_data[:name],
      kind: normalize_kind(mention_data[:kind]),
      aliases: aliases,
      description: "Extracted from #{filename}"
    )
    add_missing_aliases(entity, aliases)

    {
      entity_id: entity.id,
      # NOTE(review): the fallback role used to be derived from :type only,
      # while the entity kind is read from :kind. :type looks like a stale
      # key, so :kind is consulted first and :type is kept as a fallback.
      role: mention_data[:role] || determine_role(mention_data[:kind] || mention_data[:type]),
      text: mention_data[:name]
    }
  end
end

# Adds each alias the entity does not already carry, compared case-insensitively.
# The entity's own name never becomes an alias.
#
# @param entity [Object] entity responding to #name, #all_aliases and #add_alias
# @param aliases [Array] candidate aliases; blank entries are ignored
def add_missing_aliases(entity, aliases)
  # Computed once per entity rather than once per candidate alias.
  known = entity.all_aliases.map(&:downcase)
  known << entity.name.downcase

  aliases.each do |alias_text|
    candidate = alias_text.to_s.strip
    key = candidate.downcase
    next if candidate.empty? || known.include?(key)

    entity.add_alias(candidate)
    known << key
  end
end

# Creates one fact from extracted data and links it to its source document.
#
# @param fact_data [Hash] extracted fact (:text, :valid_at, :invalid_at, :confidence)
# @param mentions [Array<Hash>] mention hashes; duplicates by entity_id are dropped
# @param section [Hash] section the fact was extracted from
# @param filename [String] basename of the source file
# @param fact_identifier [String] "<filename>: <section_ref>" tag used for skip detection
# @param source [Object] source record to attach as the primary source
# @return [Object] the created fact
def store_fact(fact_data, mentions, section, filename, fact_identifier, source)
  fact_metadata = {
    source_file: filename,
    section_heading: section[:heading],
    section_ref: fact_identifier,
    line_start: section[:start_line],
    line_end: section[:end_line]
  }.compact

  fact = @fact_service.create(
    fact_data[:text],
    valid_at: fact_data[:valid_at] || Date.today,
    invalid_at: fact_data[:invalid_at],
    extraction_method: :llm,
    confidence: fact_data[:confidence] || 0.8,
    mentions: mentions.uniq { |mention| mention[:entity_id] },
    metadata: fact_metadata
  )

  fact.add_source(source: source, kind: :primary, confidence: 1.0)
  fact
end
|
|
352
|
+
|
|
353
|
+
# Runs the extractor on a background thread and pings the reporter about
# every 0.15 seconds until it finishes.
#
# @param text [String] cleaned section text handed to the extractor
# @return [Object] whatever `@extractor.extract(text)` returns
# @raise any exception raised by the extractor (re-raised when the thread is joined)
def extract_with_progress(text)
  @reporter.extraction_started

  worker = Thread.new { @extractor.extract(text) }

  loop do
    break unless worker.alive?

    @reporter.extraction_progress
    sleep 0.15
  end

  # Thread#value joins the thread and returns the block's result.
  worker.value
end
|
|
372
|
+
|
|
373
|
+
# Strips common markdown markup so plain prose is sent to the extractor.
#
# Removes every `*`, drops inline code spans entirely, replaces
# `[label](url)` links with their label, removes runs of `#` together with
# any whitespace that follows them, and trims surrounding whitespace.
#
# @param text [String]
# @return [String]
def clean_text(text)
  without_emphasis = text.delete("*")
  without_code = without_emphasis.gsub(/`[^`]+`/, "")
  with_plain_links = without_code.gsub(/\[([^\]]+)\]\([^)]+\)/) { Regexp.last_match(1) }

  with_plain_links.gsub(/#+\s*/, "").strip
end
|
|
382
|
+
|
|
383
|
+
# Maps an entity type to the role used for its mention.
# Valid roles: subject, object, location, temporal, instrument, beneficiary.
#
# @param entity_type [#to_s] e.g. "person", :place
# @return [Symbol] :subject for unknown or nil types
def determine_role(entity_type)
  roles_by_type = {
    "person" => :subject,
    "place" => :location,
    "organization" => :object,
    "event" => :temporal
  }

  roles_by_type.fetch(entity_type.to_s, :subject)
end
|
|
393
|
+
|
|
394
|
+
# Coerces an extracted entity kind into one of the kinds the Entity model lists.
#
# @param kind [#to_s, nil]
# @return [Symbol] :concept when kind is nil, the lower-cased kind when it is
#   in FactDb::Models::Entity::ENTITY_KINDS, otherwise :other
def normalize_kind(kind)
  return :concept if kind.nil?

  candidate = kind.to_s.downcase.to_sym
  known = FactDb::Models::Entity::ENTITY_KINDS.map(&:to_sym)
  return :other unless known.include?(candidate)

  candidate
end
|
|
402
|
+
|
|
403
|
+
# Prints service-level statistics, per-directory counts, and the number of
# entities per kind.
#
# The per-directory fact count is restricted to facts whose `source_file`
# metadata matches a source recorded for this directory. Counting every fact
# with a `source_file` would report database-wide totals under the directory
# heading.
# NOTE(review): files with the same basename in different directories are
# still counted together, since facts store only the basename.
def show_statistics
  puts "\n--- Database Statistics ---\n"

  puts "Sources:"
  ap @source_service.stats

  puts "\nEntities:"
  ap @entity_service.stats

  puts "\nFacts:"
  ap @fact_service.stats

  # Directory-specific stats if available
  if @directory
    dir_name = File.basename(@directory)
    dir_sources = FactDb::Models::Source.where("metadata->>'source_directory' = ?", dir_name)

    # Passing a relation as the bind value makes Active Record emit a subquery.
    dir_facts = FactDb::Models::Fact.where(
      "metadata->>'source_file' IN (?)",
      dir_sources.select("metadata->>'source_file'")
    ).count

    puts "\nDirectory '#{dir_name}':"
    puts " Documents loaded: #{dir_sources.count}"
    puts " Facts extracted: #{dir_facts}"
  end

  # Show discovered entities by kind
  puts "\nDiscovered entities by kind:"
  ap FactDb::Models::Entity.group(:kind).count
end
|
|
430
|
+
|
|
431
|
+
# Prints the 10 most recently created entities (with the number of facts each
# is attached to) and the 5 most recently created facts.
#
# Fact text is cut to 81 characters, and the "..." marker is appended only
# when something was actually cut off.
def demonstrate_queries
  puts "\n--- Sample Queries ---\n"

  puts "\nRecently discovered entities (last 10):"
  FactDb::Models::Entity.order(created_at: :desc).limit(10).each do |entity|
    puts " #{entity.name} (#{entity.kind}) - #{entity.facts.count} mentions"
  end

  puts "\nRecent facts (last 5):"
  FactDb::Models::Fact.order(created_at: :desc).limit(5).each do |fact|
    text = fact.text.to_s
    preview = text.length > 81 ? "#{text[0, 81]}..." : text
    puts " #{preview}"
  end
end
|
|
449
|
+
end
|
|
450
|
+
|
|
451
|
+
# Main execution
|
|
452
|
+
# Command-line entry point: parses ARGV by hand and runs the demo.
#
# --count must be followed by a positive integer. A missing or non-numeric
# value would otherwise become 0 and make the run process nothing, silently.
# Unrecognised flags are reported on stderr instead of being dropped.
# Usage text shows the script's real file name.
if __FILE__ == $PROGRAM_NAME
  script = File.basename($PROGRAM_NAME)
  options = { rebuild: false, path: nil, count: nil, reporter: nil }

  args = ARGV.dup
  while (arg = args.shift)
    case arg
    when "--rebuild"
      options[:rebuild] = true
    when "--count"
      count = Integer(args.shift.to_s, 10, exception: false)
      unless count&.positive?
        puts "Error: --count requires a positive integer"
        exit 1
      end
      options[:count] = count
    when "--quiet", "-q"
      options[:reporter] = QuietReporter.new
    when "--verbose", "-v"
      options[:reporter] = VerboseReporter.new
    when "--stats"
      # Handled immediately: no path is needed to print statistics.
      IngestDemo.new(path: ".").show_statistics_only
      exit 0
    when "--help", "-h"
      puts <<~HELP
        Document Ingest Demo for FactDb (Automatic Extraction)

        Usage:
          ruby #{script} <directory>            # Build/update database from directory
          ruby #{script} <file.md>              # Process a single markdown file
          ruby #{script} <path> --rebuild       # Drop and rebuild from scratch
          ruby #{script} <directory> --count 3  # Process only first 3 files
          ruby #{script} --stats                # Show statistics only

        Options:
          --rebuild      Clear existing data and rebuild from scratch
          --count <n>    Process only the first n files (for testing, directory only)
          --quiet, -q    Minimal output (good for scripts/CI)
          --verbose, -v  Detailed output with section-level progress
          --stats        Show database statistics only
          --help, -h     Show this help message

        Environment variables:
          FACT_DB_LLM_PROVIDER  # LLM provider (anthropic, openai, gemini, ollama)
          ANTHROPIC_API_KEY     # API key for Anthropic
          OPENAI_API_KEY        # API key for OpenAI
          DATABASE_URL          # PostgreSQL connection URL

        Accepts either a directory containing markdown (.md) files or a single
        markdown file. Files may optionally include YAML frontmatter between
        --- delimiters at the start of the file.

        Reporter classes (IngestReporter, QuietReporter, VerboseReporter) can be
        used directly in your own applications for custom progress handling.
      HELP
      exit 0
    else
      if arg.start_with?("-")
        # Still ignored, as before, but no longer silently.
        warn "Warning: ignoring unknown option #{arg}"
      else
        options[:path] = arg
      end
    end
  end

  unless options[:path]
    puts "Error: Please specify a directory or markdown file"
    puts "Usage: ruby #{script} <directory|file.md>"
    puts "Run 'ruby #{script} --help' for more information"
    exit 1
  end

  IngestDemo.new(**options).run
end
|