fact_db 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. checksums.yaml +4 -4
  2. data/.envrc +2 -0
  3. data/.yardopts +5 -0
  4. data/CHANGELOG.md +64 -0
  5. data/README.md +107 -6
  6. data/Rakefile +243 -10
  7. data/db/migrate/001_enable_extensions.rb +1 -0
  8. data/db/migrate/002_create_sources.rb +49 -0
  9. data/db/migrate/003_create_entities.rb +27 -15
  10. data/db/migrate/004_create_entity_aliases.rb +20 -7
  11. data/db/migrate/005_create_facts.rb +37 -21
  12. data/db/migrate/006_create_entity_mentions.rb +14 -6
  13. data/db/migrate/007_create_fact_sources.rb +16 -8
  14. data/docs/api/extractors/index.md +5 -5
  15. data/docs/api/extractors/llm.md +17 -17
  16. data/docs/api/extractors/rule-based.md +14 -14
  17. data/docs/api/facts.md +20 -20
  18. data/docs/api/index.md +4 -4
  19. data/docs/api/models/entity.md +21 -21
  20. data/docs/api/models/fact.md +15 -15
  21. data/docs/api/models/index.md +7 -7
  22. data/docs/api/models/{content.md → source.md} +29 -29
  23. data/docs/api/pipeline/extraction.md +25 -25
  24. data/docs/api/pipeline/index.md +1 -1
  25. data/docs/api/pipeline/resolution.md +4 -4
  26. data/docs/api/services/entity-service.md +20 -20
  27. data/docs/api/services/fact-service.md +12 -12
  28. data/docs/api/services/index.md +5 -5
  29. data/docs/api/services/{content-service.md → source-service.md} +27 -27
  30. data/docs/architecture/database-schema.md +46 -46
  31. data/docs/architecture/entity-resolution.md +6 -6
  32. data/docs/architecture/index.md +10 -10
  33. data/docs/architecture/temporal-facts.md +5 -5
  34. data/docs/architecture/three-layer-model.md +17 -17
  35. data/docs/concepts.md +6 -6
  36. data/docs/examples/basic-usage.md +20 -20
  37. data/docs/examples/hr-onboarding.md +17 -17
  38. data/docs/examples/index.md +4 -4
  39. data/docs/examples/news-analysis.md +23 -23
  40. data/docs/getting-started/database-setup.md +28 -20
  41. data/docs/getting-started/index.md +3 -3
  42. data/docs/getting-started/quick-start.md +33 -30
  43. data/docs/guides/batch-processing.md +26 -26
  44. data/docs/guides/configuration.md +158 -77
  45. data/docs/guides/entity-management.md +14 -14
  46. data/docs/guides/extracting-facts.md +28 -28
  47. data/docs/guides/ingesting-content.md +14 -14
  48. data/docs/guides/llm-integration.md +40 -32
  49. data/docs/guides/temporal-queries.md +11 -11
  50. data/docs/index.md +6 -2
  51. data/examples/.envrc +4 -0
  52. data/examples/.gitignore +1 -0
  53. data/examples/001_configuration.rb +312 -0
  54. data/examples/{basic_usage.rb → 010_basic_usage.rb} +47 -56
  55. data/examples/{entity_management.rb → 020_entity_management.rb} +57 -72
  56. data/examples/{temporal_queries.rb → 030_temporal_queries.rb} +39 -59
  57. data/examples/040_output_formats.rb +177 -0
  58. data/examples/{rule_based_extraction.rb → 050_rule_based_extraction.rb} +39 -45
  59. data/examples/060_fluent_temporal_api.rb +217 -0
  60. data/examples/070_introspection.rb +252 -0
  61. data/examples/{hr_system.rb → 080_hr_system.rb} +56 -75
  62. data/examples/090_ingest_demo.rb +515 -0
  63. data/examples/100_query_context.rb +668 -0
  64. data/examples/110_prove_it.rb +204 -0
  65. data/examples/120_dump_database.rb +358 -0
  66. data/examples/130_rag_feedback_loop.rb +858 -0
  67. data/examples/README.md +229 -15
  68. data/examples/data/lincoln_associates.md +201 -0
  69. data/examples/data/lincoln_biography.md +66 -0
  70. data/examples/data/lincoln_cabinet.md +243 -0
  71. data/examples/data/lincoln_family.md +163 -0
  72. data/examples/data/lincoln_military.md +241 -0
  73. data/examples/data/lincoln_todd_family.md +136 -0
  74. data/examples/ingest_reporter.rb +335 -0
  75. data/examples/utilities.rb +182 -0
  76. data/lib/fact_db/config/defaults.yml +254 -0
  77. data/lib/fact_db/config.rb +94 -35
  78. data/lib/fact_db/database.rb +98 -8
  79. data/lib/fact_db/extractors/base.rb +106 -21
  80. data/lib/fact_db/extractors/llm_extractor.rb +35 -63
  81. data/lib/fact_db/extractors/manual_extractor.rb +46 -6
  82. data/lib/fact_db/extractors/rule_based_extractor.rb +136 -25
  83. data/lib/fact_db/llm/adapter.rb +3 -3
  84. data/lib/fact_db/models/entity.rb +94 -22
  85. data/lib/fact_db/models/entity_alias.rb +41 -7
  86. data/lib/fact_db/models/entity_mention.rb +34 -1
  87. data/lib/fact_db/models/fact.rb +259 -28
  88. data/lib/fact_db/models/fact_source.rb +43 -9
  89. data/lib/fact_db/models/source.rb +113 -0
  90. data/lib/fact_db/pipeline/extraction_pipeline.rb +35 -35
  91. data/lib/fact_db/pipeline/resolution_pipeline.rb +5 -5
  92. data/lib/fact_db/query_result.rb +202 -0
  93. data/lib/fact_db/resolution/entity_resolver.rb +139 -39
  94. data/lib/fact_db/resolution/fact_resolver.rb +86 -14
  95. data/lib/fact_db/services/entity_service.rb +246 -37
  96. data/lib/fact_db/services/fact_service.rb +254 -17
  97. data/lib/fact_db/services/source_service.rb +164 -0
  98. data/lib/fact_db/temporal/query.rb +71 -7
  99. data/lib/fact_db/temporal/query_builder.rb +69 -0
  100. data/lib/fact_db/temporal/timeline.rb +102 -11
  101. data/lib/fact_db/transformers/base.rb +77 -0
  102. data/lib/fact_db/transformers/cypher_transformer.rb +185 -0
  103. data/lib/fact_db/transformers/json_transformer.rb +17 -0
  104. data/lib/fact_db/transformers/raw_transformer.rb +35 -0
  105. data/lib/fact_db/transformers/text_transformer.rb +114 -0
  106. data/lib/fact_db/transformers/triple_transformer.rb +138 -0
  107. data/lib/fact_db/validation/alias_filter.rb +185 -0
  108. data/lib/fact_db/version.rb +1 -1
  109. data/lib/fact_db.rb +281 -30
  110. data/mkdocs.yml +2 -2
  111. metadata +60 -16
  112. data/db/migrate/002_create_contents.rb +0 -44
  113. data/lib/fact_db/models/content.rb +0 -62
  114. data/lib/fact_db/services/content_service.rb +0 -93
@@ -6,8 +6,8 @@ Stores resolved identities (people, organizations, places, etc.).
6
6
 
7
7
  ```ruby
8
8
  entity = FactDb::Models::Entity.new(
9
- canonical_name: "Paula Chen",
10
- entity_type: "person"
9
+ name: "Paula Chen",
10
+ kind: "person"
11
11
  )
12
12
  ```
13
13
 
@@ -16,15 +16,15 @@ entity = FactDb::Models::Entity.new(
16
16
  | Attribute | Type | Description |
17
17
  |-----------|------|-------------|
18
18
  | `id` | Integer | Primary key |
19
- | `canonical_name` | String | Authoritative name |
20
- | `entity_type` | String | Type (person, organization, place, etc.) |
19
+ | `name` | String | Authoritative name |
20
+ | `kind` | String | Kind (person, organization, place, etc.) |
21
21
  | `resolution_status` | String | Status (unresolved, resolved, merged) |
22
- | `merged_into_id` | Integer | Points to canonical entity if merged |
22
+ | `canonical_id` | Integer | Points to canonical entity if merged |
23
23
  | `metadata` | Hash | Additional attributes (JSONB) |
24
24
  | `embedding` | Vector | Semantic search vector |
25
25
  | `created_at` | DateTime | Record creation time |
26
26
 
27
- ## Entity Types
27
+ ## Entity Kinds
28
28
 
29
29
  - `person` - Individual people
30
30
  - `organization` - Companies, teams, groups
@@ -52,7 +52,7 @@ belongs_to :merged_into, class_name: 'Entity', optional: true
52
52
  ### add_alias
53
53
 
54
54
  ```ruby
55
- def add_alias(text, type: nil, confidence: 1.0)
55
+ def add_alias(text, kind: nil, confidence: 1.0)
56
56
  ```
57
57
 
58
58
  Add an alias to the entity.
@@ -60,7 +60,7 @@ Add an alias to the entity.
60
60
  **Example:**
61
61
 
62
62
  ```ruby
63
- entity.add_alias("Paula", type: "nickname", confidence: 0.95)
63
+ entity.add_alias("Paula", kind: "nickname", confidence: 0.95)
64
64
  ```
65
65
 
66
66
  ### merged?
@@ -88,16 +88,16 @@ canonical = entity.canonical # Returns the canonical entity
88
88
 
89
89
  ## Scopes
90
90
 
91
- ### by_type
91
+ ### by_kind
92
92
 
93
93
  ```ruby
94
- scope :by_type, ->(type) { where(entity_type: type) }
94
+ scope :by_kind, ->(k) { where(kind: k) }
95
95
  ```
96
96
 
97
- Filter by entity type.
97
+ Filter by entity kind.
98
98
 
99
99
  ```ruby
100
- Entity.by_type('person')
100
+ Entity.by_kind('person')
101
101
  ```
102
102
 
103
103
  ### active
@@ -124,7 +124,7 @@ Only resolved entities.
124
124
 
125
125
  ```ruby
126
126
  scope :search_name, ->(query) {
127
- where("canonical_name ILIKE ?", "%#{query}%")
127
+ where("name ILIKE ?", "%#{query}%")
128
128
  }
129
129
  ```
130
130
 
@@ -140,8 +140,8 @@ Entity.search_name("paula")
140
140
 
141
141
  ```ruby
142
142
  entity = Entity.create!(
143
- canonical_name: "Paula Chen",
144
- entity_type: "person",
143
+ name: "Paula Chen",
144
+ kind: "person",
145
145
  metadata: {
146
146
  department: "Engineering",
147
147
  employee_id: "E12345"
@@ -153,15 +153,15 @@ entity = Entity.create!(
153
153
 
154
154
  ```ruby
155
155
  entity.add_alias("Paula")
156
- entity.add_alias("P. Chen", type: "abbreviation")
157
- entity.add_alias("Chen, Paula", type: "formal")
156
+ entity.add_alias("P. Chen", kind: "abbreviation")
157
+ entity.add_alias("Chen, Paula", kind: "formal")
158
158
  ```
159
159
 
160
160
  ### Check Aliases
161
161
 
162
162
  ```ruby
163
163
  entity.entity_aliases.each do |a|
164
- puts "#{a.alias_text} (#{a.alias_type})"
164
+ puts "#{a.name} (#{a.kind})"
165
165
  end
166
166
  ```
167
167
 
@@ -169,7 +169,7 @@ end
169
169
 
170
170
  ```ruby
171
171
  entity.facts.each do |fact|
172
- puts "#{fact.valid_at}: #{fact.fact_text}"
172
+ puts "#{fact.valid_at}: #{fact.text}"
173
173
  end
174
174
  ```
175
175
 
@@ -192,11 +192,11 @@ similar = Entity
192
192
  # entity2 will be merged into entity1
193
193
  entity2.update!(
194
194
  resolution_status: 'merged',
195
- merged_into_id: entity1.id
195
+ canonical_id: entity1.id
196
196
  )
197
197
 
198
198
  # Copy aliases
199
199
  entity2.entity_aliases.each do |a|
200
- entity1.add_alias(a.alias_text, type: a.alias_type)
200
+ entity1.add_alias(a.name, kind: a.kind)
201
201
  end
202
202
  ```
@@ -6,7 +6,7 @@ Stores temporal assertions about entities.
6
6
 
7
7
  ```ruby
8
8
  fact = FactDb::Models::Fact.new(
9
- fact_text: "Paula Chen is Principal Engineer",
9
+ text: "Paula Chen is Principal Engineer",
10
10
  valid_at: Date.parse("2024-01-10"),
11
11
  status: "canonical"
12
12
  )
@@ -17,8 +17,8 @@ fact = FactDb::Models::Fact.new(
17
17
  | Attribute | Type | Description |
18
18
  |-----------|------|-------------|
19
19
  | `id` | Integer | Primary key |
20
- | `fact_text` | Text | The assertion |
21
- | `fact_hash` | String | Hash for deduplication |
20
+ | `text` | Text | The assertion |
21
+ | `digest` | String | SHA256 digest for deduplication |
22
22
  | `valid_at` | DateTime | When fact became true |
23
23
  | `invalid_at` | DateTime | When fact stopped being true (nil if current) |
24
24
  | `status` | String | Status (canonical, superseded, corroborated, synthesized) |
@@ -72,7 +72,7 @@ fact.add_mention(
72
72
  ### add_source
73
73
 
74
74
  ```ruby
75
- def add_source(content:, type: "primary", excerpt: nil, confidence: 1.0)
75
+ def add_source(source:, kind: "primary", excerpt: nil, confidence: 1.0)
76
76
  ```
77
77
 
78
78
  Add a source content link.
@@ -81,8 +81,8 @@ Add a source content link.
81
81
 
82
82
  ```ruby
83
83
  fact.add_source(
84
- content: email,
85
- type: "primary",
84
+ source: email,
85
+ kind: "primary",
86
86
  excerpt: "...accepted the offer..."
87
87
  )
88
88
  ```
@@ -169,7 +169,7 @@ Fact.mentioning_entity(paula.id)
169
169
 
170
170
  ```ruby
171
171
  scope :search_text, ->(query) {
172
- where("fact_text @@ plainto_tsquery(?)", query)
172
+ where("text @@ plainto_tsquery(?)", query)
173
173
  }
174
174
  ```
175
175
 
@@ -207,7 +207,7 @@ High confidence facts only.
207
207
 
208
208
  ```ruby
209
209
  fact = Fact.create!(
210
- fact_text: "Paula Chen joined Microsoft as Principal Engineer",
210
+ text: "Paula Chen joined Microsoft as Principal Engineer",
211
211
  valid_at: Date.parse("2024-01-10"),
212
212
  status: "canonical",
213
213
  extraction_method: "manual",
@@ -219,7 +219,7 @@ fact.add_mention(entity: paula, text: "Paula Chen", role: "subject")
219
219
  fact.add_mention(entity: microsoft, text: "Microsoft", role: "organization")
220
220
 
221
221
  # Add source
222
- fact.add_source(content: announcement, type: "primary")
222
+ fact.add_source(source: announcement, kind: "primary")
223
223
  ```
224
224
 
225
225
  ### Query Facts
@@ -239,7 +239,7 @@ Fact.search_text("promoted")
239
239
 
240
240
  ```ruby
241
241
  new_fact = Fact.create!(
242
- fact_text: "Paula Chen is Senior Principal Engineer",
242
+ text: "Paula Chen is Senior Principal Engineer",
243
243
  valid_at: Date.parse("2024-06-01"),
244
244
  status: "canonical"
245
245
  )
@@ -254,10 +254,10 @@ old_fact.update!(
254
254
  ### Get Sources
255
255
 
256
256
  ```ruby
257
- fact.fact_sources.each do |source|
258
- puts "Source: #{source.content.title}"
259
- puts "Type: #{source.source_type}"
260
- puts "Excerpt: #{source.excerpt}"
257
+ fact.fact_sources.each do |fact_source|
258
+ puts "Source: #{fact_source.source.title}"
259
+ puts "Kind: #{fact_source.kind}"
260
+ puts "Excerpt: #{fact_source.excerpt}"
261
261
  end
262
262
  ```
263
263
 
@@ -265,6 +265,6 @@ end
265
265
 
266
266
  ```ruby
267
267
  fact.entity_mentions.each do |mention|
268
- puts "#{mention.entity.canonical_name} (#{mention.mention_role})"
268
+ puts "#{mention.entity.name} (#{mention.mention_role})"
269
269
  end
270
270
  ```
@@ -4,7 +4,7 @@ FactDb uses ActiveRecord models for data persistence.
4
4
 
5
5
  ## Core Models
6
6
 
7
- - [Content](content.md) - Immutable source documents
7
+ - [Source](source.md) - Immutable source content
8
8
  - [Entity](entity.md) - Resolved identities with aliases
9
9
  - [Fact](fact.md) - Temporal assertions
10
10
 
@@ -23,8 +23,8 @@ end
23
23
  | Column | Type | Description |
24
24
  |--------|------|-------------|
25
25
  | entity_id | bigint | Parent entity |
26
- | alias_text | string | Alternative name |
27
- | alias_type | string | Type (nickname, abbreviation, etc.) |
26
+ | name | string | Alternative name |
27
+ | kind | string | Kind (nickname, abbreviation, etc.) |
28
28
  | confidence | float | Match confidence |
29
29
 
30
30
  ### EntityMention
@@ -53,15 +53,15 @@ Links facts to source content.
53
53
  ```ruby
54
54
  class FactSource < ActiveRecord::Base
55
55
  belongs_to :fact
56
- belongs_to :content
56
+ belongs_to :source
57
57
  end
58
58
  ```
59
59
 
60
60
  | Column | Type | Description |
61
61
  |--------|------|-------------|
62
62
  | fact_id | bigint | Parent fact |
63
- | content_id | bigint | Source content |
64
- | source_type | string | Type (primary, supporting, contradicting) |
63
+ | source_id | bigint | Source content |
64
+ | kind | string | Kind (primary, supporting, corroborating) |
65
65
  | excerpt | text | Relevant text excerpt |
66
66
  | confidence | float | Source confidence |
67
67
 
@@ -69,7 +69,7 @@ end
69
69
 
70
70
  ```mermaid
71
71
  erDiagram
72
- Content ||--o{ FactSource : "sourced by"
72
+ Source ||--o{ FactSource : "sourced by"
73
73
  Entity ||--o{ EntityAlias : "has"
74
74
  Entity ||--o{ EntityMention : "mentioned in"
75
75
  Fact ||--o{ EntityMention : "mentions"
@@ -1,13 +1,13 @@
1
- # Content Model
1
+ # Source Model
2
2
 
3
- Stores immutable source documents.
3
+ Stores immutable source content from which facts are extracted.
4
4
 
5
- ## Class: `FactDb::Models::Content`
5
+ ## Class: `FactDb::Models::Source`
6
6
 
7
7
  ```ruby
8
- content = FactDb::Models::Content.new(
9
- raw_text: "Document content...",
10
- content_type: "email",
8
+ source = FactDb::Models::Source.new(
9
+ content: "Document content...",
10
+ kind: "email",
11
11
  captured_at: Time.current
12
12
  )
13
13
  ```
@@ -18,11 +18,11 @@ content = FactDb::Models::Content.new(
18
18
  |-----------|------|-------------|
19
19
  | `id` | Integer | Primary key |
20
20
  | `content_hash` | String | SHA256 hash for deduplication |
21
- | `content_type` | String | Type (email, document, etc.) |
22
- | `raw_text` | Text | Original content |
21
+ | `kind` | String | Kind (email, document, etc.) |
22
+ | `content` | Text | Original unmodified text content |
23
23
  | `title` | String | Optional title |
24
24
  | `source_uri` | String | Original location |
25
- | `source_metadata` | Hash | Additional metadata (JSONB) |
25
+ | `metadata` | Hash | Additional metadata (JSONB) |
26
26
  | `embedding` | Vector | Semantic search vector |
27
27
  | `captured_at` | DateTime | When content was captured |
28
28
  | `created_at` | DateTime | Record creation time |
@@ -49,7 +49,7 @@ before_create :generate_embedding
49
49
  def compute_hash
50
50
  ```
51
51
 
52
- Computes SHA256 hash of raw_text for deduplication.
52
+ Computes SHA256 hash of content for deduplication.
53
53
 
54
54
  ### generate_embedding
55
55
 
@@ -67,30 +67,30 @@ Generates embedding vector using configured generator.
67
67
  def self.find_or_create_by_text(text, **attributes)
68
68
  ```
69
69
 
70
- Find existing content by hash or create new.
70
+ Find existing source by hash or create new.
71
71
 
72
72
  **Example:**
73
73
 
74
74
  ```ruby
75
- content = Content.find_or_create_by_text(
75
+ source = Source.find_or_create_by_text(
76
76
  "Document text",
77
- content_type: "document",
77
+ kind: "document",
78
78
  captured_at: Time.current
79
79
  )
80
80
  ```
81
81
 
82
82
  ## Scopes
83
83
 
84
- ### by_type
84
+ ### by_kind
85
85
 
86
86
  ```ruby
87
- scope :by_type, ->(type) { where(content_type: type) }
87
+ scope :by_kind, ->(kind) { where(kind: kind) }
88
88
  ```
89
89
 
90
- Filter by content type.
90
+ Filter by source kind.
91
91
 
92
92
  ```ruby
93
- Content.by_type('email')
93
+ Source.by_kind('email')
94
94
  ```
95
95
 
96
96
  ### captured_between
@@ -104,35 +104,35 @@ scope :captured_between, ->(from, to) {
104
104
  Filter by capture date range.
105
105
 
106
106
  ```ruby
107
- Content.captured_between(1.week.ago, Time.current)
107
+ Source.captured_between(1.week.ago, Time.current)
108
108
  ```
109
109
 
110
110
  ### search_text
111
111
 
112
112
  ```ruby
113
113
  scope :search_text, ->(query) {
114
- where("raw_text @@ plainto_tsquery(?)", query)
114
+ where("content @@ plainto_tsquery(?)", query)
115
115
  }
116
116
  ```
117
117
 
118
118
  Full-text search.
119
119
 
120
120
  ```ruby
121
- Content.search_text("quarterly earnings")
121
+ Source.search_text("quarterly earnings")
122
122
  ```
123
123
 
124
124
  ## Usage Examples
125
125
 
126
- ### Create Content
126
+ ### Create Source
127
127
 
128
128
  ```ruby
129
- content = Content.create!(
130
- raw_text: "Important document...",
131
- content_type: "document",
129
+ source = Source.create!(
130
+ content: "Important document...",
131
+ kind: "document",
132
132
  title: "Q4 Report",
133
133
  source_uri: "https://example.com/report.pdf",
134
134
  captured_at: Time.current,
135
- source_metadata: {
135
+ metadata: {
136
136
  author: "Jane Smith",
137
137
  department: "Finance"
138
138
  }
@@ -143,14 +143,14 @@ content = Content.create!(
143
143
 
144
144
  ```ruby
145
145
  hash = Digest::SHA256.hexdigest("Document text")
146
- content = Content.find_by(content_hash: hash)
146
+ source = Source.find_by(content_hash: hash)
147
147
  ```
148
148
 
149
149
  ### Get Related Facts
150
150
 
151
151
  ```ruby
152
- content.facts.each do |fact|
153
- puts fact.fact_text
152
+ source.facts.each do |fact|
153
+ puts fact.text
154
154
  end
155
155
  ```
156
156
 
@@ -158,7 +158,7 @@ end
158
158
 
159
159
  ```ruby
160
160
  # Requires embedding
161
- similar = Content
161
+ similar = Source
162
162
  .where.not(embedding: nil)
163
163
  .order(Arel.sql("embedding <=> '#{query_embedding}'"))
164
164
  .limit(10)
@@ -13,14 +13,14 @@ pipeline = FactDb::Pipeline::ExtractionPipeline.new(config)
13
13
  ### process
14
14
 
15
15
  ```ruby
16
- def process(contents, extractor: config.default_extractor)
16
+ def process(sources, extractor: config.default_extractor)
17
17
  ```
18
18
 
19
- Process content items sequentially.
19
+ Process source items sequentially.
20
20
 
21
21
  **Parameters:**
22
22
 
23
- - `contents` (Array<Content>) - Content records
23
+ - `sources` (Array<Source>) - Source records
24
24
  - `extractor` (Symbol) - Extraction method
25
25
 
26
26
  **Returns:** `Array<Hash>`
@@ -28,8 +28,8 @@ Process content items sequentially.
28
28
  **Example:**
29
29
 
30
30
  ```ruby
31
- contents = Models::Content.where(id: [1, 2, 3])
32
- results = pipeline.process(contents, extractor: :llm)
31
+ sources = Models::Source.where(id: [1, 2, 3])
32
+ results = pipeline.process(sources, extractor: :llm)
33
33
  ```
34
34
 
35
35
  ---
@@ -37,14 +37,14 @@ results = pipeline.process(contents, extractor: :llm)
37
37
  ### process_parallel
38
38
 
39
39
  ```ruby
40
- def process_parallel(contents, extractor: config.default_extractor)
40
+ def process_parallel(sources, extractor: config.default_extractor)
41
41
  ```
42
42
 
43
- Process content items concurrently.
43
+ Process source items concurrently.
44
44
 
45
45
  **Parameters:**
46
46
 
47
- - `contents` (Array<Content>) - Content records
47
+ - `sources` (Array<Source>) - Source records
48
48
  - `extractor` (Symbol) - Extraction method
49
49
 
50
50
  **Returns:** `Array<Hash>`
@@ -52,10 +52,10 @@ Process content items concurrently.
52
52
  **Example:**
53
53
 
54
54
  ```ruby
55
- results = pipeline.process_parallel(contents, extractor: :llm)
55
+ results = pipeline.process_parallel(sources, extractor: :llm)
56
56
 
57
57
  results.each do |result|
58
- puts "Content #{result[:content_id]}:"
58
+ puts "Source #{result[:source_id]}:"
59
59
  puts " Facts: #{result[:facts].count}"
60
60
  puts " Error: #{result[:error]}" if result[:error]
61
61
  end
@@ -67,7 +67,7 @@ end
67
67
 
68
68
  ```mermaid
69
69
  graph LR
70
- A[Content] --> B[Validate]
70
+ A[Source] --> B[Validate]
71
71
  B --> C[Extract]
72
72
  C --> D[Validate Facts]
73
73
  D --> E[Results]
@@ -79,7 +79,7 @@ graph LR
79
79
  style E fill:#B91C1C,stroke:#991B1B,color:#FFFFFF
80
80
  ```
81
81
 
82
- 1. **Validate** - Check content is not empty
82
+ 1. **Validate** - Check source is not empty
83
83
  2. **Extract** - Run extractor
84
84
  3. **Validate Facts** - Filter valid facts
85
85
  4. **Results** - Return extracted facts
@@ -89,9 +89,9 @@ graph LR
89
89
  ```mermaid
90
90
  graph TB
91
91
  subgraph Parallel
92
- A1[Content 1] --> E1[Extract 1]
93
- A2[Content 2] --> E2[Extract 2]
94
- A3[Content 3] --> E3[Extract 3]
92
+ A1[Source 1] --> E1[Extract 1]
93
+ A2[Source 2] --> E2[Extract 2]
94
+ A3[Source 3] --> E3[Extract 3]
95
95
  end
96
96
  E1 --> Aggregate
97
97
  E2 --> Aggregate
@@ -110,22 +110,22 @@ graph TB
110
110
 
111
111
  ```ruby
112
112
  {
113
- content_id: 123,
113
+ source_id: 123,
114
114
  facts: [<Fact>, <Fact>, ...], # Extracted facts
115
115
  error: nil # Error message if failed
116
116
  }
117
117
  ```
118
118
 
119
- ## Usage via Facts
119
+ ## Usage via FactDb
120
120
 
121
121
  ```ruby
122
122
  facts = FactDb.new
123
123
 
124
124
  # Sequential
125
- results = facts.batch_extract(content_ids, parallel: false)
125
+ results = facts.batch_extract(source_ids, parallel: false)
126
126
 
127
127
  # Parallel (default)
128
- results = facts.batch_extract(content_ids, parallel: true)
128
+ results = facts.batch_extract(source_ids, parallel: true)
129
129
  ```
130
130
 
131
131
  ## Error Handling
@@ -133,13 +133,13 @@ results = facts.batch_extract(content_ids, parallel: true)
133
133
  The pipeline catches errors per-item:
134
134
 
135
135
  ```ruby
136
- results = pipeline.process_parallel(contents)
136
+ results = pipeline.process_parallel(sources)
137
137
 
138
138
  results.each do |result|
139
139
  if result[:error]
140
- logger.error "Content #{result[:content_id]}: #{result[:error]}"
140
+ logger.error "Source #{result[:source_id]}: #{result[:error]}"
141
141
  else
142
- logger.info "Content #{result[:content_id]}: #{result[:facts].count} facts"
142
+ logger.info "Source #{result[:source_id]}: #{result[:facts].count} facts"
143
143
  end
144
144
  end
145
145
  ```
@@ -151,12 +151,12 @@ end
151
151
  Optimal batch size depends on:
152
152
 
153
153
  - Extractor type (LLM has rate limits)
154
- - Content length
154
+ - Source length
155
155
  - System resources
156
156
 
157
157
  ```ruby
158
158
  # Process in optimal batches
159
- contents.each_slice(25) do |batch|
159
+ sources.each_slice(25) do |batch|
160
160
  results = pipeline.process_parallel(batch)
161
161
  process_results(results)
162
162
  end
@@ -167,7 +167,7 @@ end
167
167
  For large batches, process and discard:
168
168
 
169
169
  ```ruby
170
- contents.each_slice(50) do |batch|
170
+ sources.each_slice(50) do |batch|
171
171
  results = pipeline.process_parallel(batch)
172
172
  save_facts(results.flat_map { |r| r[:facts] })
173
173
  # Results discarded after each batch
@@ -50,7 +50,7 @@ Pipeline results follow a consistent format:
50
50
 
51
51
  ```ruby
52
52
  {
53
- content_id: 123, # Item identifier
53
+ source_id: 123, # Item identifier
54
54
  facts: [<Fact>, ...], # Extracted/resolved items
55
55
  error: nil # Error message if failed
56
56
  }
@@ -33,7 +33,7 @@ results = pipeline.resolve_entities(names)
33
33
 
34
34
  results.each do |result|
35
35
  puts "#{result[:name]}: #{result[:status]}"
36
- puts " Entity: #{result[:entity]&.canonical_name}"
36
+ puts " Entity: #{result[:entity]&.name}"
37
37
  end
38
38
  ```
39
39
 
@@ -61,7 +61,7 @@ results = pipeline.detect_conflicts([paula.id, john.id])
61
61
  results.each do |result|
62
62
  puts "Entity #{result[:entity_id]}: #{result[:conflict_count]} conflicts"
63
63
  result[:conflicts].each do |c|
64
- puts " - #{c[:fact1].fact_text} vs #{c[:fact2].fact_text}"
64
+ puts " - #{c[:fact1].text} vs #{c[:fact2].text}"
65
65
  end
66
66
  end
67
67
  ```
@@ -153,7 +153,7 @@ results = facts.detect_fact_conflicts([entity1.id, entity2.id])
153
153
 
154
154
  The pipeline uses the EntityResolver which tries:
155
155
 
156
- 1. **Exact match** on canonical name
156
+ 1. **Exact match** on name
157
157
  2. **Alias match** on registered aliases
158
158
  3. **Fuzzy match** using Levenshtein distance
159
159
 
@@ -202,7 +202,7 @@ end
202
202
 
203
203
  ```ruby
204
204
  # Load entities into memory first
205
- Entity.where(entity_type: 'person').to_a
205
+ Entity.where(kind: 'person').to_a
206
206
 
207
207
  # Then resolve
208
208
  results = pipeline.resolve_entities(person_names, type: :person)