@geravant/sinain 1.15.5 → 1.18.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -246,6 +246,88 @@ def query_facts_by_entity_graph(
246
246
  return []
247
247
 
248
248
 
249
def expand_entity_community(
    store,
    entity_name: str,
    max_related: int = 3,
    max_facts_per_entity: int = 30,
) -> list[tuple[str, int]]:
    """Find related entities by following entity → facts → mentioned entities.

    Args:
        store: Object exposing ``entity(node_id)`` and
            ``backrefs(node_id, attribute=...)``. Presumably a TripleStore;
            verify against the caller.
        entity_name: Entity name. It is lower-cased and spaces are replaced
            with hyphens to build the ``entity:<name>`` node id.
        max_related: Maximum number of (name, count) pairs to return.
        max_facts_per_entity: Cap applied separately to the "about" and the
            "mentions" backrefs of the entity.

    Returns:
        ``[(entity_name, co_mention_count), ...]`` sorted by descending count.
        Ties are broken alphabetically by name so the result (and therefore
        the ``max_related`` cut-off) is reproducible across runs. Names are
        node ids with the ``entity:`` prefix removed. An empty list is
        returned when the entity node does not exist.
    """
    entity_node_id = f"entity:{entity_name.lower().replace(' ', '-')}"
    if not store.entity(entity_node_id):
        return []

    # Collect facts linked to this entity (both about and mentions).
    fact_ids: set[str] = set()
    for attribute in ("about", "mentions"):
        linked = store.backrefs(entity_node_id, attribute=attribute)
        for fact_eid, _ in linked[:max_facts_per_entity]:
            if fact_eid.startswith("fact:"):
                fact_ids.add(fact_eid)

    # Follow each fact's outgoing refs to find other entity nodes.
    prefix = "entity:"
    related_counts: dict[str, int] = {}
    for fact_eid in fact_ids:
        attrs = store.entity(fact_eid)
        if not attrs:
            # Dangling backref: the fact node has no attributes to follow.
            continue
        for ref_attr in ("about", "mentions"):
            targets = attrs.get(ref_attr, [])
            if not isinstance(targets, list):
                # A single-valued attribute is treated as a one-item list.
                targets = [targets]
            for target in targets:
                if not isinstance(target, str):
                    continue
                if target == entity_node_id or not target.startswith(prefix):
                    continue
                name = target[len(prefix):]
                related_counts[name] = related_counts.get(name, 0) + 1

    # Sort by frequency (then name, for determinism) and return the top N.
    ranked = sorted(related_counts.items(), key=lambda item: (-item[1], item[0]))
    return ranked[:max_related]
288
+
289
+
290
def _cooccurring_entities(
    store,
    fact_ids: set[str],
    max_entities: int = 3,
) -> list[str]:
    """Find entities that co-occur in the same distillation pass (shared first_seen timestamp).

    Args:
        store: Object exposing ``entity(node_id)`` and a ``_conn`` attribute
            with a DB-API style ``execute`` over a ``triples`` table
            (columns used: ``entity_id``, ``attribute``, ``value``,
            ``retracted``).
        fact_ids: Ids of the facts whose ``first_seen`` timestamps seed the
            search. These facts are excluded from the co-occurrence scan.
        max_entities: Maximum number of entity names to return.

    Returns:
        Values of the ``entity`` attribute of other ``fact:`` nodes sharing a
        ``first_seen`` value with the input facts, ordered by the number of
        distinct co-occurring facts (descending), ties broken by name.
    """
    if not fact_ids:
        return []

    # Get first_seen timestamps for the input facts. Only 20 facts are
    # sampled to keep the IN (...) list small; the sample is taken in sorted
    # order so the same input always yields the same timestamps.
    timestamps = set()
    for fid in sorted(fact_ids)[:20]:
        attrs = store.entity(fid)
        if not attrs:
            continue  # unknown fact id: nothing to read
        fs = attrs.get("first_seen", [])
        if isinstance(fs, list) and fs:
            timestamps.add(fs[0])
        elif isinstance(fs, str):
            timestamps.add(fs)

    if not timestamps:
        return []

    # Find other facts with the same timestamps and count, per entity name,
    # how many distinct facts carry it. (A plain SELECT DISTINCT on the name
    # would collapse every count to 1 and make the ranking meaningless.)
    ts_placeholders = ",".join("?" for _ in timestamps)
    fact_placeholders = ",".join("?" for _ in fact_ids)
    rows = store._conn.execute(
        "SELECT t2.value, COUNT(DISTINCT t1.entity_id) FROM triples t1 "
        "JOIN triples t2 ON t2.entity_id = t1.entity_id AND t2.attribute = 'entity' AND t2.retracted = 0 "
        f"WHERE t1.attribute = 'first_seen' AND t1.value IN ({ts_placeholders}) "
        "AND t1.retracted = 0 AND t1.entity_id LIKE 'fact:%' "
        f"AND t1.entity_id NOT IN ({fact_placeholders}) "
        "GROUP BY t2.value",
        list(timestamps) + list(fact_ids),
    ).fetchall()

    # Rank by co-occurrence count; the name is a deterministic tie-breaker.
    counts = {name: count for name, count in rows}
    ranked = sorted(counts, key=lambda name: (-counts[name], str(name)))
    return ranked[:max_entities]
329
+
330
+
249
331
  def query_facts_hybrid(
250
332
  db_path: str,
251
333
  query: str,
@@ -257,17 +339,45 @@ def query_facts_hybrid(
257
339
  expands top results with 1-hop graph neighbors.
258
340
  """
259
341
  import re
342
+ import time
260
343
  keywords = [w.lower() for w in re.findall(r"[a-zA-Z][a-zA-Z0-9-]+", query) if len(w) > 2]
261
344
 
262
345
  # Entity graph pre-filter: find facts linked to mentioned entities via backrefs.
263
346
  # Used to BOOST relevant facts in RRF, not as a separate tier (avoids dilution).
264
347
  graph_fact_ids: set[str] = set()
348
+ community_fact_ids: set[str] = set()
265
349
  for kw in keywords:
266
350
  for f in query_facts_by_entity_graph(db_path, kw, max_facts=50):
267
351
  eid = f.get("entity_id", "")
268
352
  if eid:
269
353
  graph_fact_ids.add(eid)
270
354
 
355
+ # Community expansion: follow mentions edges to find related entities
356
+ t0 = time.monotonic()
357
+ try:
358
+ from triplestore import TripleStore
359
+ store = TripleStore(db_path)
360
+
361
+ matched_entities = set()
362
+ for kw in keywords:
363
+ node_id = f"entity:{kw}"
364
+ if store.entity(node_id):
365
+ matched_entities.add(kw)
366
+
367
+ for ent in matched_entities:
368
+ if time.monotonic() - t0 > 0.5:
369
+ break # timing guard
370
+ community = expand_entity_community(store, ent, max_related=3)
371
+ for related_name, _count in community:
372
+ for f in query_facts_by_entity_graph(db_path, related_name, max_facts=20):
373
+ eid = f.get("entity_id", "")
374
+ if eid and eid not in graph_fact_ids:
375
+ community_fact_ids.add(eid)
376
+
377
+ store.close()
378
+ except Exception:
379
+ pass
380
+
271
381
  # Run three retrieval methods independently
272
382
  candidate_limit = max_facts * 3
273
383
  fts_results = query_facts_fts(db_path, query, max_facts=candidate_limit)
@@ -296,11 +406,31 @@ def query_facts_hybrid(
296
406
  for rank, eid in enumerate(ranked_list):
297
407
  rrf_scores[eid] = rrf_scores.get(eid, 0.0) + 1.0 / (K + rank)
298
408
 
409
+ # Co-occurrence boost: use FTS/tag results to find temporally related entities
410
+ import time as _time
411
+ _t_cooccur = _time.monotonic()
412
+ query_matched_ids = {f.get("entity_id", "") for f in fts_results + tag_results if f.get("entity_id")}
413
+ if query_matched_ids and _time.monotonic() - _t_cooccur < 0.3:
414
+ try:
415
+ from triplestore import TripleStore
416
+ _store = TripleStore(db_path)
417
+ cooccur = _cooccurring_entities(_store, query_matched_ids, max_entities=5)
418
+ for ent_name in cooccur:
419
+ for f in query_facts_by_entity_graph(db_path, ent_name, max_facts=10):
420
+ eid = f.get("entity_id", "")
421
+ if eid and eid not in graph_fact_ids:
422
+ community_fact_ids.add(eid)
423
+ _store.close()
424
+ except Exception:
425
+ pass
426
+
299
427
  # Graph boost: facts linked to mentioned entities via backrefs get priority
300
- if graph_fact_ids:
428
+ if graph_fact_ids or community_fact_ids:
301
429
  for eid in rrf_scores:
302
430
  if eid in graph_fact_ids:
303
- rrf_scores[eid] += 0.02 # significant boost — graph-linked facts rank higher
431
+ rrf_scores[eid] += 0.02 # direct graph-linked facts
432
+ elif eid in community_fact_ids:
433
+ rrf_scores[eid] += 0.01 # community-expanded facts (half weight)
304
434
 
305
435
  # Apply confidence decay as secondary signal (fresh facts rank above stale ones)
306
436
  from triplestore import decayed_confidence