legion-apollo 0.5.0 → 0.5.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 37c9cddf7269f0a38272eeedb3fe3f3abb50440271b4ec9686820307233fe12f
4
- data.tar.gz: 28962732f0a157e4e6f5abbf2ad494cd4d9fdb56f9d3723de3d074710143118e
3
+ metadata.gz: 9320cef3392467f9819e47c9cacc3893f45ffab9570825cb5d6b270ff3ce6a67
4
+ data.tar.gz: d38343aa96c9ba0b1d634f6e4d3850e22c899fe81c13550dce19809cec290a94
5
5
  SHA512:
6
- metadata.gz: f1a1999ff0d3731d67af120efc58c215faeccc6a278ba02c11bfbc84f45dfb50520556e7966bdc6ee9a29a7744cdd61ac3cbc31f99b5112a0c7c3a82e2011141
7
- data.tar.gz: f4421e4ace8b3e735ae78012e8579a052ee054f2f1a72160cf401f47fcf74fef3ab96925d567378804fadcd84f290c774cdaf73ecc24d94f8fdf23427b9b76bf
6
+ metadata.gz: 2dcaf3fc8fbf087b8e59c079e71921aebf8c53620e32d9cd8939a00981474747b78e978412d28a5c3ff4f204df01b9538f23a81c82a76a0d94afab8080219e46
7
+ data.tar.gz: 8d078c7e1c727bd23b08bfdb09af0abe7581296132d715b7ed7c4c5205d4645c004f1f980af6546239961c6649e9433e66a725341e2275e3cdb2e1de7f1687a3
data/CHANGELOG.md CHANGED
@@ -1,5 +1,19 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.5.2] - 2026-04-27
4
+
5
+ ### Added
6
+ - Store `raw_content` alongside indexed `content` in Apollo Local so callers can preserve verbatim source text separately from retrieval text (#25, #26)
7
+ - Add `valid_from`/`valid_to` temporal windows and `as_of:` query filtering for local knowledge entries (#27)
8
+
9
+ ### Fixed
10
+ - Sanitize Apollo ingest and query text by scrubbing invalid UTF-8 and removing null bytes before routing to local or global backends (#29)
11
+
12
+ ## [0.5.1] - 2026-04-27
13
+
14
+ ### Fixed
15
+ - Guard Apollo Local tag queries and promotion against nil, shutdown, or unavailable local DB connections before SQL and Ruby fallback paths (#30)
16
+
3
17
  ## [0.5.0] - 2026-04-18
4
18
 
5
19
  ### Added
data/README.md CHANGED
@@ -1,10 +1,10 @@
1
1
  # legion-apollo
2
2
 
3
- Apollo client library for the LegionIO framework.
3
+ Apollo is the LegionIO knowledge client. It gives extensions one API for writing, retrieving, and merging knowledge across the global Apollo service and the node-local SQLite store.
4
4
 
5
- **Version**: 0.3.2
5
+ **Version**: 0.5.2
6
6
 
7
- Provides `query`, `ingest`, and `retrieve` with smart routing: co-located lex-apollo service, RabbitMQ transport, or graceful failure. Supports a node-local SQLite knowledge store (`Apollo::Local`) that mirrors the same API without requiring any remote infrastructure.
7
+ `legion-apollo` provides `query`, `ingest`, and `retrieve` with smart routing: co-located `lex-apollo`, RabbitMQ transport, node-local SQLite, or graceful failure. `Apollo::Local` mirrors the same public API for offline and low-latency retrieval without requiring remote infrastructure.
8
8
 
9
9
  ## Usage
10
10
 
@@ -21,6 +21,24 @@ results = Legion::Apollo.query(text: 'local note', scope: :local)
21
21
 
22
22
  # Query both and merge (deduped by content hash, ranked by confidence)
23
23
  results = Legion::Apollo.query(text: 'ruby', scope: :all)
24
+
25
+ # Preserve verbatim source text separately from indexed retrieval content
26
+ Legion::Apollo.ingest(
27
+ content: 'Summarized policy note for search',
28
+ raw_content: 'Exact source text from the original record',
29
+ tags: %w[policy source],
30
+ scope: :local
31
+ )
32
+
33
+ # Ingest an entry with a validity window, then query the local store as of a point in time
34
+ Legion::Apollo.ingest(
35
+ content: 'Policy version active in Q2',
36
+ tags: %w[policy],
37
+ valid_from: '2026-04-01T00:00:00.000Z',
38
+ valid_to: '2026-06-30T23:59:59.999Z',
39
+ scope: :local
40
+ )
41
+ results = Legion::Apollo.query(text: 'policy', scope: :local, as_of: '2026-05-01T00:00:00.000Z')
24
42
  ```
25
43
 
26
44
  ## Scopes
@@ -37,9 +55,12 @@ results = Legion::Apollo.query(text: 'ruby', scope: :all)
37
55
 
38
56
  Features:
39
57
  - Content-hash dedup (MD5 of normalized content)
58
+ - `raw_content` preservation for verbatim source text
59
+ - `valid_from` / `valid_to` temporal windows with `as_of:` query filtering
40
60
  - Optional LLM embeddings (1024-dim) with cosine rerank when `Legion::LLM.can_embed?`
41
61
  - TTL expiry (default 5-year retention)
42
62
  - FTS5 full-text search with case-insensitive `LIKE` fallback
63
+ - Null-byte removal and invalid UTF-8 scrubbing before persistence or backend routing
43
64
 
44
65
  ## Configuration
45
66
 
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ Sequel.migration do
4
+ up do
5
+ alter_table(:local_knowledge) do
6
+ add_column :raw_content, :text, null: true
7
+ add_column :valid_from, String, null: true
8
+ add_column :valid_to, String, null: true
9
+
10
+ add_index :valid_from, name: :idx_local_knowledge_valid_from
11
+ add_index :valid_to, name: :idx_local_knowledge_valid_to
12
+ end
13
+ end
14
+
15
+ down do
16
+ alter_table(:local_knowledge) do
17
+ drop_column :raw_content
18
+ drop_column :valid_from
19
+ drop_column :valid_to
20
+ end
21
+ end
22
+ end
@@ -99,18 +99,19 @@ module Legion
99
99
  limit ||= local_setting(:default_limit, 5)
100
100
  min_confidence ||= local_setting(:min_confidence, 0.3)
101
101
  multiplier = local_setting(:fts_candidate_multiplier, 3)
102
+ as_of = normalize_temporal_value(opts[:as_of])
102
103
  log.info do
103
104
  "Apollo::Local query executing text_length=#{text.to_s.length} " \
104
105
  "limit=#{limit} min_confidence=#{min_confidence} tag_count=#{Array(tags).size}"
105
106
  end
106
107
  log.debug { "Apollo::Local query limit=#{limit} min_confidence=#{min_confidence} tags=#{Array(tags).size}" }
107
108
 
108
- candidates = fts_search(text, limit: limit * multiplier)
109
+ candidates = fts_search(text, limit: limit * multiplier, as_of: as_of)
109
110
  include_inferences = opts.fetch(:include_inferences, true)
110
111
  include_history = opts.fetch(:include_history, false)
111
112
  candidates = filter_candidates(candidates, min_confidence: min_confidence, tags: tags,
112
- include_inferences: include_inferences,
113
- include_history: include_history)
113
+ options: { include_inferences: include_inferences,
114
+ include_history: include_history, as_of: as_of })
114
115
  candidates = cosine_rerank(text, candidates) if can_rerank?
115
116
  results = candidates.first(limit)
116
117
 
@@ -159,10 +160,11 @@ module Legion
159
160
  end
160
161
 
161
162
  def query_by_tags(tags:, limit: 50) # rubocop:disable Metrics/MethodLength
162
- return { success: false, error: :not_started } unless started?
163
-
163
+ connection = local_db_connection
164
164
  tags = normalize_tags_input(tags)
165
- results = query_by_tags_via_sql(tags: tags, limit: limit)
165
+ return { success: false, error: :not_started } unless local_db_usable?(connection)
166
+
167
+ results = query_by_tags_via_sql(connection, tags: tags, limit: limit)
166
168
 
167
169
  log.info { "Apollo::Local query_by_tags completed tag_count=#{tags.size} count=#{results.size}" }
168
170
  { success: true, results: results, count: results.size }
@@ -178,11 +180,13 @@ module Legion
178
180
  end
179
181
 
180
182
  def promote_to_global(tags:, min_confidence: 0.6) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
181
- return { success: false, error: :not_started } unless started?
183
+ return { success: false, error: :not_started } unless local_db_usable?(local_db_connection)
182
184
 
183
185
  tags = normalize_tags_input(tags)
184
186
  entries = query_by_tags(tags: tags)
185
- unless entries[:success] && entries[:results]&.any?
187
+ return entries unless entries[:success]
188
+
189
+ unless entries[:results]&.any?
186
190
  log.info { "Apollo::Local promote_to_global skipped tag_count=#{tags.size} reason=no_entries" }
187
191
  return { success: true, promoted: 0 }
188
192
  end
@@ -200,6 +204,7 @@ module Legion
200
204
  end
201
205
  result = Legion::Apollo.ingest(
202
206
  content: entry[:content],
207
+ raw_content: entry[:raw_content] || entry[:content],
203
208
  tags: entry_tags + ['promoted_from_local'],
204
209
  source_channel: 'local_promotion',
205
210
  submitted_by: "node:#{hostname}",
@@ -337,6 +342,26 @@ module Legion
337
342
  Legion::Data::Local.connection
338
343
  end
339
344
 
345
+ def local_db_connection
346
+ return nil unless started? && data_local_available?
347
+
348
+ db
349
+ rescue StandardError => e
350
+ handle_exception(e, level: :debug, operation: 'apollo.local.local_db_connection')
351
+ nil
352
+ end
353
+
354
+ def local_db_usable?(connection)
355
+ return false unless started? && connection
356
+ return false if connection.respond_to?(:closed?) && connection.closed?
357
+
358
+ connection.test_connection if connection.respond_to?(:test_connection)
359
+ true
360
+ rescue StandardError => e
361
+ handle_exception(e, level: :debug, operation: 'apollo.local.local_db_usable')
362
+ false
363
+ end
364
+
340
365
  def content_hash(content)
341
366
  normalized = content.to_s.strip.downcase.gsub(/\s+/, ' ')
342
367
  Digest::MD5.hexdigest(normalized)
@@ -396,9 +421,12 @@ module Legion
396
421
 
397
422
  result = ingest(
398
423
  content: entry[:content],
424
+ raw_content: entry[:raw_content] || entry[:content],
399
425
  tags: clean_tags,
400
426
  confidence: ((entry[:confidence] || 0.5) * 0.9).round(10),
401
- source_channel: 'global_hydration'
427
+ source_channel: 'global_hydration',
428
+ valid_from: entry[:valid_from],
429
+ valid_to: entry[:valid_to]
402
430
  )
403
431
  hydrated += 1 if result[:success]
404
432
  end
@@ -408,6 +436,8 @@ module Legion
408
436
  end
409
437
 
410
438
  def ingest_without_lock(content:, tags:, **opts) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
439
+ content = normalize_text_input(content)
440
+ raw_content = normalize_text_input(opts.key?(:raw_content) ? opts[:raw_content] : content)
411
441
  hash = content_hash(content)
412
442
  return deduplicated_ingest(hash) if duplicate?(hash)
413
443
 
@@ -417,9 +447,11 @@ module Legion
417
447
  end
418
448
  log.debug { "Apollo::Local ingest hash=#{hash} tags=#{Array(tags).size} source_channel=#{opts[:source_channel]}" }
419
449
 
420
- row = build_ingest_row(content: content, hash: hash, tags: tags, **opts)
421
- id = persist_ingest_row(row, opts)
422
- mark_parent_superseded(opts[:parent_knowledge_id]) if opts[:parent_knowledge_id]
450
+ metadata = opts.dup
451
+ metadata.delete(:raw_content)
452
+ row = build_ingest_row(content: content, raw_content: raw_content, hash: hash, tags: tags, **metadata)
453
+ id = persist_ingest_row(row, metadata)
454
+ mark_parent_superseded(metadata[:parent_knowledge_id]) if metadata[:parent_knowledge_id]
423
455
 
424
456
  log.info { "Apollo::Local ingest stored id=#{id} hash=#{hash}" }
425
457
  { success: true, mode: :local, id: id }
@@ -429,22 +461,56 @@ module Legion
429
461
  deduplicated_ingest(hash)
430
462
  end
431
463
 
432
- def build_ingest_row(content:, hash:, tags:, **opts) # rubocop:disable Metrics/MethodLength
464
+ def build_ingest_row(content:, raw_content:, hash:, tags:, **opts) # rubocop:disable Metrics/MethodLength
433
465
  is_inference = opts[:is_inference] == true
434
466
  default_confidence = is_inference ? Legion::Apollo::Helpers::Confidence::INITIAL_INFERENCE_CONFIDENCE : 1.0
467
+ ingest_metadata_columns(
468
+ content: content,
469
+ raw_content: raw_content,
470
+ hash: hash,
471
+ tags: tags,
472
+ opts: opts,
473
+ is_inference: is_inference,
474
+ default_confidence: default_confidence
475
+ ).merge(embedding_columns(content, opts)).merge(timestamp_columns)
476
+ end
477
+
478
+ def ingest_metadata_columns(context)
479
+ ingest_base_columns(context)
480
+ .merge(ingest_lineage_columns(context[:opts]))
481
+ .merge(ingest_temporal_columns(context[:opts]))
482
+ end
483
+
484
+ def ingest_base_columns(context)
485
+ opts = context[:opts]
486
+ {
487
+ content: context[:content],
488
+ raw_content: context[:raw_content],
489
+ content_hash: context[:hash],
490
+ tags: serialized_tags(context[:tags]),
491
+ confidence: opts[:confidence] || context[:default_confidence],
492
+ is_inference: context[:is_inference]
493
+ }.merge(ingest_source_columns(opts))
494
+ end
495
+
496
+ def ingest_source_columns(opts)
497
+ { source_channel: opts[:source_channel], source_agent: opts[:source_agent],
498
+ submitted_by: opts[:submitted_by] }
499
+ end
500
+
501
+ def ingest_lineage_columns(opts)
435
502
  {
436
- content: content,
437
- content_hash: hash,
438
- tags: serialized_tags(tags),
439
- source_channel: opts[:source_channel],
440
- source_agent: opts[:source_agent],
441
- submitted_by: opts[:submitted_by],
442
- confidence: opts[:confidence] || default_confidence,
443
- is_inference: is_inference,
444
503
  forget_reason: opts[:forget_reason],
445
504
  parent_knowledge_id: opts[:parent_knowledge_id],
446
505
  supersession_type: opts[:supersession_type]
447
- }.merge(embedding_columns(content, opts)).merge(timestamp_columns)
506
+ }
507
+ end
508
+
509
+ def ingest_temporal_columns(opts)
510
+ {
511
+ valid_from: normalize_temporal_value(opts[:valid_from]),
512
+ valid_to: normalize_temporal_value(opts[:valid_to])
513
+ }
448
514
  end
449
515
 
450
516
  def persist_ingest_row(row, opts = {})
@@ -539,41 +605,41 @@ module Legion
539
605
  Legion::JSON.dump(normalize_tags_input(tags))
540
606
  end
541
607
 
542
- def fts_search(text, limit:) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
608
+ def fts_search(text, limit:, as_of: nil) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
543
609
  now = Time.now.utc.strftime('%Y-%m-%dT%H:%M:%S.%LZ')
544
- if text.to_s.strip.empty?
545
- return db[:local_knowledge]
546
- .where(Sequel.lit('expires_at > ?', now))
547
- .limit(limit)
548
- .all
549
- end
610
+ return active_knowledge_dataset(now: now, as_of: as_of).limit(limit).all if text.to_s.strip.empty?
550
611
 
551
612
  tokens = text.to_s.scan(/[\p{L}\p{N}_]+/)
552
- return ilike_search(text, now: now, limit: limit) if tokens.empty?
613
+ return ilike_search(text, now: now, limit: limit, as_of: as_of) if tokens.empty?
553
614
 
554
615
  escaped = tokens.map { |t| %("#{t}") }.join(' ')
616
+ temporal_sql, temporal_params = temporal_window_sql(as_of, table_alias: 'lk')
555
617
  db.fetch(
556
618
  'SELECT lk.* FROM local_knowledge lk ' \
557
619
  'INNER JOIN local_knowledge_fts fts ON lk.id = fts.rowid ' \
558
- 'WHERE local_knowledge_fts MATCH ? AND lk.expires_at > ? ORDER BY fts.rank LIMIT ?',
559
- escaped, now, limit
620
+ "WHERE local_knowledge_fts MATCH ? AND lk.expires_at > ?#{temporal_sql} " \
621
+ 'ORDER BY fts.rank LIMIT ?',
622
+ escaped, now, *temporal_params, limit
560
623
  ).all
561
624
  rescue StandardError => e
562
625
  handle_exception(e, level: :debug, operation: 'apollo.local.fts_search', limit: limit, fallback: :ilike)
563
- ilike_search(text, now: Time.now.utc.strftime('%Y-%m-%dT%H:%M:%S.%LZ'), limit: limit)
626
+ ilike_search(text, now: Time.now.utc.strftime('%Y-%m-%dT%H:%M:%S.%LZ'), limit: limit, as_of: as_of)
564
627
  end
565
628
 
566
- def ilike_search(text, now:, limit:)
629
+ def ilike_search(text, now:, limit:, as_of: nil)
567
630
  safe_text = text.to_s.gsub('\\', '\\\\\\\\').gsub('%', '\%').gsub('_', '\_')
568
- db[:local_knowledge]
569
- .where(Sequel.lit('expires_at > ?', now))
631
+ active_knowledge_dataset(now: now, as_of: as_of)
570
632
  .where(Sequel.lit("content LIKE ? ESCAPE '\\' COLLATE NOCASE", "%#{safe_text}%"))
571
633
  .limit(limit)
572
634
  .all
573
635
  end
574
636
 
575
- def filter_candidates(candidates, min_confidence:, tags:, include_inferences: true, include_history: false) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity,Metrics/MethodLength,Metrics/AbcSize
637
+ def filter_candidates(candidates, min_confidence:, tags:, options: {}) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity,Metrics/MethodLength,Metrics/AbcSize
638
+ include_inferences = options.fetch(:include_inferences, true)
639
+ include_history = options.fetch(:include_history, false)
640
+ as_of = options[:as_of]
576
641
  candidates = candidates.select { |c| (c[:confidence] || 0) >= min_confidence }
642
+ candidates = candidates.select { |c| temporally_valid?(c, as_of) }
577
643
  candidates = candidates.reject { |c| [1, true].include?(c[:is_inference]) } unless include_inferences
578
644
  unless include_history
579
645
  candidates = candidates.select { |c| c[:is_latest].nil? || c[:is_latest] == 1 || c[:is_latest] == true }
@@ -588,6 +654,36 @@ module Legion
588
654
  candidates
589
655
  end
590
656
 
657
+ def active_knowledge_dataset(now:, as_of: nil)
658
+ apply_temporal_window(db[:local_knowledge].where(Sequel.lit('expires_at > ?', now)), as_of)
659
+ end
660
+
661
+ def apply_temporal_window(dataset, as_of)
662
+ return dataset if as_of.to_s.empty?
663
+
664
+ dataset.where(
665
+ Sequel.lit('(valid_from IS NULL OR valid_from <= ?) AND (valid_to IS NULL OR valid_to >= ?)', as_of, as_of)
666
+ )
667
+ end
668
+
669
+ def temporal_window_sql(as_of, table_alias:)
670
+ return ['', []] if as_of.to_s.empty?
671
+
672
+ [
673
+ " AND (#{table_alias}.valid_from IS NULL OR #{table_alias}.valid_from <= ?) " \
674
+ "AND (#{table_alias}.valid_to IS NULL OR #{table_alias}.valid_to >= ?)",
675
+ [as_of, as_of]
676
+ ]
677
+ end
678
+
679
+ def temporally_valid?(row, as_of)
680
+ return true if as_of.to_s.empty?
681
+
682
+ valid_from = row[:valid_from]
683
+ valid_to = row[:valid_to]
684
+ (valid_from.nil? || valid_from <= as_of) && (valid_to.nil? || valid_to >= as_of)
685
+ end
686
+
591
687
  def parse_tags(tags_json)
592
688
  return [] if tags_json.nil? || tags_json.empty?
593
689
 
@@ -656,6 +752,17 @@ module Legion
656
752
  value.to_s
657
753
  end
658
754
 
755
+ def normalize_temporal_value(value)
756
+ return nil if value.nil?
757
+
758
+ text = value.respond_to?(:utc) ? value.utc.strftime('%Y-%m-%dT%H:%M:%S.%LZ') : value.to_s.strip
759
+ return nil if text.empty?
760
+
761
+ Time.parse(text).utc.strftime('%Y-%m-%dT%H:%M:%S.%LZ')
762
+ rescue StandardError
763
+ text
764
+ end
765
+
659
766
  def normalize_tags_input(tags)
660
767
  Legion::Apollo::Helpers::TagNormalizer.normalize(Array(tags)).first(max_tags_limit)
661
768
  rescue StandardError => e
@@ -674,9 +781,9 @@ module Legion
674
781
  Legion::Apollo::Helpers::TagNormalizer::MAX_TAGS
675
782
  end
676
783
 
677
- def query_by_tags_via_sql(tags:, limit:) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
784
+ def query_by_tags_via_sql(connection, tags:, limit:) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
678
785
  now = Time.now.utc.strftime('%Y-%m-%dT%H:%M:%S.%LZ')
679
- dataset = db[:local_knowledge].where(Sequel.lit('expires_at > ?', now))
786
+ dataset = connection[:local_knowledge].where(Sequel.lit('expires_at > ?', now))
680
787
 
681
788
  Array(tags).map(&:to_s).each do |tag|
682
789
  dataset = dataset.where(
@@ -696,11 +803,15 @@ module Legion
696
803
  tag_count: Array(tags).size,
697
804
  limit: limit
698
805
  )
699
- query_by_tags_via_ruby(tags: tags, limit: limit)
806
+ raise unless local_db_usable?(connection)
807
+
808
+ query_by_tags_via_ruby(connection, tags: tags, limit: limit)
700
809
  end
701
810
 
702
- def query_by_tags_via_ruby(tags:, limit:)
703
- candidates = db[:local_knowledge]
811
+ def query_by_tags_via_ruby(connection, tags:, limit:)
812
+ raise Sequel::DatabaseConnectionError, 'local database unavailable' unless local_db_usable?(connection)
813
+
814
+ candidates = connection[:local_knowledge]
704
815
  .where(Sequel.lit('expires_at > ?', Time.now.utc.strftime('%Y-%m-%dT%H:%M:%S.%LZ')))
705
816
  .all
706
817
 
@@ -711,7 +822,7 @@ module Legion
711
822
  end
712
823
 
713
824
  def update_upsert_entry(existing, content, tags_json, opts) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
714
- content = content.to_s
825
+ content = normalize_text_input(content)
715
826
  new_hash = content_hash(content)
716
827
  embedding, embedded_at = generate_embedding(content)
717
828
  now = Time.now.utc.strftime('%Y-%m-%dT%H:%M:%S.%LZ')
@@ -68,6 +68,7 @@ module Legion
68
68
  agent_id: body[:agent_id] || 'api',
69
69
  scope: normalize_scope(body[:scope]),
70
70
  tier: body[:tier]&.to_sym,
71
+ as_of: body[:as_of],
71
72
  include_inferences: body.fetch(:include_inferences, true),
72
73
  include_history: body.fetch(:include_history, false)
73
74
  )
@@ -88,6 +89,7 @@ module Legion
88
89
  tags = Legion::Apollo::Helpers::TagNormalizer.normalize(Array(body[:tags])).first(effective_max_tags)
89
90
  result = Legion::Apollo.ingest(
90
91
  content: body[:content],
92
+ raw_content: body[:raw_content],
91
93
  content_type: body[:content_type] || :observation,
92
94
  tags: tags,
93
95
  source_agent: body[:source_agent] || 'api',
@@ -99,6 +101,8 @@ module Legion
99
101
  is_inference: body[:is_inference] == true,
100
102
  forget_reason: body[:forget_reason],
101
103
  expires_at: body[:expires_at],
104
+ valid_from: body[:valid_from],
105
+ valid_to: body[:valid_to],
102
106
  parent_knowledge_id: body[:parent_knowledge_id],
103
107
  supersession_type: body[:supersession_type],
104
108
  source_uri: body[:source_uri],
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Legion
4
4
  module Apollo
5
- VERSION = '0.5.0'
5
+ VERSION = '0.5.2'
6
6
  end
7
7
  end
data/lib/legion/apollo.rb CHANGED
@@ -97,9 +97,11 @@ module Legion
97
97
  return not_started_error unless started?
98
98
 
99
99
  normalized_tags = normalize_tags_input(tags)
100
- payload = { content: content, tags: normalized_tags, **opts }
100
+ normalized_content = normalize_text_input(content)
101
+ normalized_raw_content = normalize_text_input(opts.key?(:raw_content) ? opts[:raw_content] : content)
102
+ payload = { **opts, content: normalized_content, raw_content: normalized_raw_content, tags: normalized_tags }
101
103
  log.info do
102
- "Apollo ingest requested scope=#{scope} content_length=#{content.to_s.length} " \
104
+ "Apollo ingest requested scope=#{scope} content_length=#{payload[:content].to_s.length} " \
103
105
  "tags=#{payload[:tags].size} source_channel=#{payload[:source_channel]}"
104
106
  end
105
107
  log.debug do
@@ -289,7 +291,8 @@ module Legion
289
291
  "limit=#{payload[:limit]}"
290
292
  end
291
293
  result = Legion::Apollo::Local.query(**payload.slice(:text, :limit, :min_confidence, :tags,
292
- :tier, :include_inferences, :include_history))
294
+ :tier, :include_inferences, :include_history,
295
+ :as_of))
293
296
  return result unless result[:success]
294
297
 
295
298
  entries = normalize_local_entries(Array(result[:results]))
@@ -324,7 +327,8 @@ module Legion
324
327
  if Legion::Apollo::Local.started?
325
328
  attempted = true
326
329
  local = Legion::Apollo::Local.query(**payload.slice(:text, :limit, :min_confidence, :tags,
327
- :tier, :include_inferences, :include_history))
330
+ :tier, :include_inferences, :include_history,
331
+ :as_of))
328
332
  if local[:success]
329
333
  any_success = true
330
334
  entries.concat(normalize_local_entries(Array(local[:results]))) if local[:results]
@@ -377,18 +381,29 @@ module Legion
377
381
  else
378
382
  Array(e[:tags])
379
383
  end
380
- { id: e[:id], content: e[:content], content_hash: hash,
381
- confidence: e[:confidence] || 0.5, content_type: 'fact', tags: tags, source: :local }
384
+ { id: e[:id], content: e[:content], raw_content: e[:raw_content] || e[:content], content_hash: hash,
385
+ confidence: e[:confidence] || 0.5, content_type: 'fact', tags: tags, source: :local,
386
+ valid_from: e[:valid_from], valid_to: e[:valid_to] }
382
387
  end
383
388
  end
384
389
 
385
390
  def normalize_global_entries(entries)
386
- entries.map do |e|
387
- hash = e[:content_hash] || Digest::MD5.hexdigest(e[:content].to_s.strip.downcase.gsub(/\s+/, ' '))
388
- { id: e[:id], content: e[:content], content_hash: hash,
389
- confidence: e[:confidence] || 0.5, content_type: e[:content_type] || 'fact',
390
- tags: Array(e[:tags]), source: :global }
391
- end
391
+ entries.map { |entry| normalize_global_entry(entry) }
392
+ end
393
+
394
+ def normalize_global_entry(entry)
395
+ { id: entry[:id], content: entry[:content], raw_content: normalized_raw_content(entry),
396
+ content_hash: normalized_content_hash(entry), confidence: entry[:confidence] || 0.5,
397
+ content_type: entry[:content_type] || 'fact', tags: Array(entry[:tags]), source: :global,
398
+ valid_from: entry[:valid_from], valid_to: entry[:valid_to] }
399
+ end
400
+
401
+ def normalized_raw_content(entry)
402
+ entry[:raw_content] || entry[:content]
403
+ end
404
+
405
+ def normalized_content_hash(entry)
406
+ entry[:content_hash] || Digest::MD5.hexdigest(entry[:content].to_s.strip.downcase.gsub(/\s+/, ' '))
392
407
  end
393
408
 
394
409
  def dedup_and_rank(entries, limit:)
@@ -470,20 +485,28 @@ module Legion
470
485
  end
471
486
 
472
487
  def normalize_text_input(value) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/MethodLength
473
- case value
474
- when String
475
- value
476
- when Array
477
- parts = value.filter_map { |entry| extract_text_fragment(entry) }
478
- joined = parts.map(&:to_s).map(&:strip).reject(&:empty?).join("\n")
479
- joined.empty? ? value.to_s : joined
480
- when Hash
481
- extract_text_fragment(value).to_s
482
- when nil
483
- ''
484
- else
485
- value.to_s
486
- end
488
+ text = case value
489
+ when String
490
+ value
491
+ when Array
492
+ parts = value.filter_map { |entry| extract_text_fragment(entry) }
493
+ joined = parts.map(&:to_s).map(&:strip).reject(&:empty?).join("\n")
494
+ joined.empty? ? value.to_s : joined
495
+ when Hash
496
+ extract_text_fragment(value).to_s
497
+ when nil
498
+ ''
499
+ else
500
+ value.to_s
501
+ end
502
+ sanitize_text_input(text)
503
+ end
504
+
505
+ def sanitize_text_input(value)
506
+ text = value.to_s.dup
507
+ text = text.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
508
+ text = text.scrub('') unless text.valid_encoding?
509
+ text.delete("\u0000")
487
510
  end
488
511
 
489
512
  def normalize_tags_input(tags)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: legion-apollo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.5.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Esity
@@ -87,6 +87,7 @@ files:
87
87
  - lib/legion/apollo/local/migrations/002_create_graph_tables.rb
88
88
  - lib/legion/apollo/local/migrations/003_harden_graph_relationships.rb
89
89
  - lib/legion/apollo/local/migrations/004_add_versioning_tiers_inference.rb
90
+ - lib/legion/apollo/local/migrations/005_add_raw_content_temporal_windows.rb
90
91
  - lib/legion/apollo/messages/access_boost.rb
91
92
  - lib/legion/apollo/messages/ingest.rb
92
93
  - lib/legion/apollo/messages/query.rb