topos-node 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shared/__init__.py +59 -0
- shared/filtering.py +640 -0
- shared/schema_registry.py +229 -0
- topos/__init__.py +5 -0
- topos/__version__.py +6 -0
- topos/analytics/__init__.py +15 -0
- topos/analytics/duckdb_adapter.py +48 -0
- topos/analytics/messenger_communities.py +349 -0
- topos/analytics/messenger_graph.py +522 -0
- topos/analytics/messenger_labels.py +321 -0
- topos/analytics/profiles.py +22 -0
- topos/analytics/query_engine.py +64 -0
- topos/analytics/raw_queries.py +174 -0
- topos/api/__init__.py +1 -0
- topos/api/analytics.py +52 -0
- topos/api/app_registry.py +31 -0
- topos/api/backup.py +15 -0
- topos/api/compute_remote.py +175 -0
- topos/api/data_commit.py +158 -0
- topos/api/data_explorer_table_prefs.py +81 -0
- topos/api/db.py +10 -0
- topos/api/device.py +25 -0
- topos/api/enrichment.py +959 -0
- topos/api/filter_lab.py +195 -0
- topos/api/health.py +61 -0
- topos/api/ingestion_api.py +37 -0
- topos/api/ingestion_compat.py +21 -0
- topos/api/ingestion_sources.py +600 -0
- topos/api/llm.py +76 -0
- topos/api/local_mcp.py +46 -0
- topos/api/messenger_analytics.py +385 -0
- topos/api/query_api.py +13 -0
- topos/api/sanitization_ollama_config.py +64 -0
- topos/api/source_install.py +324 -0
- topos/api/sources.py +13 -0
- topos/api/sync.py +10 -0
- topos/api/ui_config.py +83 -0
- topos/api/uma_data.py +311 -0
- topos/api/usage.py +49 -0
- topos/api/user_identity.py +46 -0
- topos/app.py +239 -0
- topos/auth.py +17 -0
- topos/canonicalization/__init__.py +1 -0
- topos/canonicalization/mappers/__init__.py +22 -0
- topos/canonicalization/mappers/base.py +26 -0
- topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
- topos/canonicalization/mappers/grok_mapper.py +17 -0
- topos/canonicalization/mappers/messenger_mapper.py +58 -0
- topos/canonicalization/models.py +31 -0
- topos/canonicalization/resolver.py +23 -0
- topos/cli/__init__.py +1 -0
- topos/cli/__main__.py +6 -0
- topos/cli/commands.py +132 -0
- topos/config/__init__.py +1 -0
- topos/config/sanitization_ollama.py +189 -0
- topos/config/settings.py +310 -0
- topos/contacts/__init__.py +5 -0
- topos/contacts/identity.py +24 -0
- topos/control_plane_client.py +300 -0
- topos/core/__init__.py +1 -0
- topos/core/api_models.py +128 -0
- topos/core/connection_resilience.py +99 -0
- topos/core/device_helpers.py +8 -0
- topos/core/errors.py +13 -0
- topos/core/events.py +12 -0
- topos/core/handlers.py +5625 -0
- topos/core/logging.py +175 -0
- topos/core/metrics.py +21 -0
- topos/core/startup_banner.py +62 -0
- topos/core/state.py +682 -0
- topos/core/table_layers.py +45 -0
- topos/core/types.py +13 -0
- topos/data_explorer_table_prefs.py +150 -0
- topos/engine/__init__.py +29 -0
- topos/engine/backends/__init__.py +50 -0
- topos/engine/backends/base.py +21 -0
- topos/engine/backends/huggingface.py +151 -0
- topos/engine/backends/ollama.py +181 -0
- topos/engine/backends/stub.py +22 -0
- topos/engine/engine.py +165 -0
- topos/engine/intake.py +32 -0
- topos/engine/queue_manager.py +112 -0
- topos/engine/registration.py +126 -0
- topos/engine/result_formatter.py +38 -0
- topos/engine/router.py +19 -0
- topos/engine/scoped_token.py +82 -0
- topos/engine/tasks.py +154 -0
- topos/engine/transport.py +44 -0
- topos/engine/usage_guard.py +100 -0
- topos/engine/usage_observation.py +129 -0
- topos/engine/validator.py +23 -0
- topos/enrichment/__init__.py +1 -0
- topos/enrichment/derived_tables.py +214 -0
- topos/enrichment/jobs/__init__.py +30 -0
- topos/enrichment/jobs/base.py +54 -0
- topos/enrichment/jobs/canonical/__init__.py +1 -0
- topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
- topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
- topos/enrichment/jobs/canonical/entities_job.py +27 -0
- topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
- topos/enrichment/jobs/canonical/topics_job.py +27 -0
- topos/enrichment/jobs/raw/__init__.py +1 -0
- topos/enrichment/jobs/raw/attachments_job.py +12 -0
- topos/enrichment/jobs/raw/language_job.py +12 -0
- topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
- topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
- topos/enrichment/models/__init__.py +1 -0
- topos/enrichment/models/manager.py +8 -0
- topos/enrichment/models/registry.py +71 -0
- topos/enrichment/models/versioning.py +8 -0
- topos/enrichment/orchestrator.py +177 -0
- topos/enrichment/processor.py +17 -0
- topos/enrichment/progress_bar.py +122 -0
- topos/enrichment/website_classifier.py +31 -0
- topos/filter_lab/__init__.py +1 -0
- topos/filter_lab/bundles.py +300 -0
- topos/filter_lab/schema.py +86 -0
- topos/filter_lab/service.py +167 -0
- topos/filter_lab/store.py +374 -0
- topos/filter_lab/worker.py +250 -0
- topos/hosted_pool_lease.py +153 -0
- topos/ingestion/__init__.py +1 -0
- topos/ingestion/checkpoints/__init__.py +6 -0
- topos/ingestion/checkpoints/checkpoint_store.py +24 -0
- topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
- topos/ingestion/ingest_helpers.py +504 -0
- topos/ingestion/jobs.py +91 -0
- topos/ingestion/local_sync.py +823 -0
- topos/ingestion/log_preview.py +21 -0
- topos/ingestion/manager.py +1100 -0
- topos/ingestion/parser.py +174 -0
- topos/ingestion/parsers/__init__.py +32 -0
- topos/ingestion/parsers/base.py +24 -0
- topos/ingestion/parsers/browser_parser.py +171 -0
- topos/ingestion/parsers/calendar_parser.py +21 -0
- topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
- topos/ingestion/parsers/chatgpt_parser.py +67 -0
- topos/ingestion/parsers/grok_parser.py +21 -0
- topos/ingestion/parsers/messenger_parser.py +97 -0
- topos/ingestion/progress.py +54 -0
- topos/ingestion/sources/__init__.py +20 -0
- topos/ingestion/sources/base.py +39 -0
- topos/ingestion/sources/calendar.py +29 -0
- topos/ingestion/sources/chatgpt.py +29 -0
- topos/ingestion/sources/contact_importers.py +274 -0
- topos/ingestion/sources/grok.py +29 -0
- topos/ingestion/sources/imessage_reader.py +479 -0
- topos/ingestion/sources/signal_export_parser.py +132 -0
- topos/ingestion/sources/signal_reader.py +491 -0
- topos/ingestion/state_machine.py +70 -0
- topos/ingestion/triggers/__init__.py +1 -0
- topos/ingestion/triggers/file_trigger.py +36 -0
- topos/ingestion/triggers/sqlite_trigger.py +18 -0
- topos/ingestion/validation/__init__.py +1 -0
- topos/ingestion/validation/base.py +27 -0
- topos/ingestion/validation/schema_registry.py +111 -0
- topos/ingestion/validation/schema_validator.py +13 -0
- topos/lineage/__init__.py +1 -0
- topos/lineage/provenance.py +9 -0
- topos/lineage/tracker.py +9 -0
- topos/mcp_stdio_proxy.py +83 -0
- topos/observability/__init__.py +1 -0
- topos/observability/alerts.py +7 -0
- topos/observability/metrics.py +25 -0
- topos/observability/tracing.py +18 -0
- topos/openai_client.py +69 -0
- topos/projections/__init__.py +1 -0
- topos/projections/vector_index/__init__.py +1 -0
- topos/projections/vector_index/base.py +21 -0
- topos/projections/vector_index/builders.py +11 -0
- topos/projections/vector_index/health_checks.py +5 -0
- topos/rate_limit.py +43 -0
- topos/sanitization/__init__.py +16 -0
- topos/sanitization/ollama_transforms.py +276 -0
- topos/scope_resolution.py +89 -0
- topos/services/__init__.py +1 -0
- topos/services/container.py +46 -0
- topos/services/embeddings/__init__.py +1 -0
- topos/services/embeddings/base.py +7 -0
- topos/services/embeddings/local.py +9 -0
- topos/services/embeddings/remote.py +9 -0
- topos/services/interfaces.py +40 -0
- topos/services/llm/__init__.py +1 -0
- topos/services/llm/base.py +7 -0
- topos/services/llm/openai.py +126 -0
- topos/services/local.py +123 -0
- topos/services/postgres.py +385 -0
- topos/sources/__init__.py +6 -0
- topos/sources/definitions.py +114 -0
- topos/sources/install_service.py +836 -0
- topos/sources/registry.py +263 -0
- topos/sources/runtime_install.py +427 -0
- topos/storage/__init__.py +1 -0
- topos/storage/canonical/__init__.py +18 -0
- topos/storage/canonical/ai_chat/__init__.py +22 -0
- topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
- topos/storage/canonical/ai_chat/mapper.py +168 -0
- topos/storage/canonical/ai_chat/model.py +87 -0
- topos/storage/canonical/ai_chat/tables.py +179 -0
- topos/storage/canonical/canonical_store.py +24 -0
- topos/storage/canonical/conversations_tables.py +1020 -0
- topos/storage/canonical/mapping_store.py +30 -0
- topos/storage/canonical/postgres.py +10 -0
- topos/storage/db/__init__.py +1 -0
- topos/storage/db/client.py +8 -0
- topos/storage/db/migrations/__init__.py +1 -0
- topos/storage/db/migrations/stage9_column_renames.py +78 -0
- topos/storage/db/paths.py +122 -0
- topos/storage/db/postgres.py +240 -0
- topos/storage/db/schema.py +6 -0
- topos/storage/enrichment/__init__.py +1 -0
- topos/storage/enrichment/canonical_enrichment_store.py +7 -0
- topos/storage/enrichment/raw_enrichment_store.py +18 -0
- topos/storage/normalized/__init__.py +1 -0
- topos/storage/normalized/normalized_store.py +24 -0
- topos/storage/oplog/__init__.py +1 -0
- topos/storage/oplog/decision.py +6 -0
- topos/storage/oplog/oplog_store.py +17 -0
- topos/storage/oplog/postgres.py +10 -0
- topos/storage/projections/__init__.py +1 -0
- topos/storage/projections/index_ops_store.py +6 -0
- topos/storage/projections/vector_index_store.py +6 -0
- topos/storage/raw/__init__.py +1 -0
- topos/storage/raw/browser_flat_tables.py +303 -0
- topos/storage/raw/file_store.py +100 -0
- topos/storage/raw/raw_store.py +29 -0
- topos/storage/raw/raw_tables_manager.py +295 -0
- topos/storage/raw/sqlite_raw_store.py +17 -0
- topos/storage/security/encryption.py +21 -0
- topos/storage/signal_identity.py +71 -0
- topos/storage/source_settings.py +116 -0
- topos/storage/user_identity.py +69 -0
- topos/sync/__init__.py +5 -0
- topos/sync/client.py +272 -0
- topos/sync_handlers.py +70 -0
- topos/testing/__init__.py +1 -0
- topos/testing/lifespan.py +7 -0
- topos/uma_contact_enrichment.py +1032 -0
- topos/uma_filters.py +669 -0
- topos/uma_resource_id.py +24 -0
- topos/uma_rpt.py +69 -0
- topos/utils/base_object.py +61 -0
- topos/websocket_client.py +21 -0
- topos_node-0.1.0.dist-info/METADATA +199 -0
- topos_node-0.1.0.dist-info/RECORD +249 -0
- topos_node-0.1.0.dist-info/WHEEL +5 -0
- topos_node-0.1.0.dist-info/entry_points.txt +2 -0
- topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
- topos_node-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,522 @@
|
|
|
1
|
+
"""Messenger graph extraction from canonical conversation tables (Sprint 01).
|
|
2
|
+
|
|
3
|
+
Nodes are strictly chat participants from canonical membership/senders.
|
|
4
|
+
Edges combine:
|
|
5
|
+
- co-participation in conversations
|
|
6
|
+
- direct links from reply-to relationships
|
|
7
|
+
- direct links from @mentions in message content (only when mention resolves to a participant)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import re
|
|
14
|
+
from collections import Counter, defaultdict
|
|
15
|
+
from datetime import datetime
|
|
16
|
+
from itertools import combinations
|
|
17
|
+
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Matches an "@token" mention: an "@" that is not preceded by a word character,
# followed by 2-64 characters drawn from letters, digits, "_", ".", "+" and "-".
# Group 1 captures the token without the leading "@".
MENTION_PATTERN = re.compile(r"(?<!\w)@([A-Za-z0-9_.+\-]{2,64})")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Per-source notes describing which canonical fields carry reply and mention
# information. Keys are source names; each value lists "reply_fields",
# "mention_fields" and free-text "notes". extract_messenger_graph returns this
# mapping unchanged under the "investigation" key of its result.
SOURCE_DIRECT_LINK_INVESTIGATION: Dict[str, Dict[str, Any]] = {
    "imessage": {
        "reply_fields": [
            "conversation_messages.reply_to_message_id",
            "conversation_messages.metadata_json.thread_originator_guid",
            "conversation_messages.metadata_json.associated_message_guid",
        ],
        "mention_fields": [
            "message content @token (regex extraction)",
        ],
        "notes": (
            "iMessage ingestion maps thread-originator/associated context into "
            "reply_to_message_id and metadata_json. No structured mention field is "
            "currently ingested; mentions are extracted from content."
        ),
    },
    "signal": {
        "reply_fields": [
            "conversation_messages.reply_to_message_id",
            "conversation_messages.metadata_json.quoteId",
            "conversation_messages.metadata_json.quotedMessageId",
            "conversation_messages.metadata_json.replyToMessageId",
        ],
        "mention_fields": [
            "message content @token (regex extraction)",
            "metadata_json may include quoteAuthor* fields (used as context only)",
        ],
        "notes": (
            "Signal ingestion resolves quote/reply context into reply_to_message_id "
            "when possible. No canonical structured @mention list is currently stored."
        ),
    },
    # Placeholder entry: no reply/mention fields are defined for this source yet.
    "whatsapp": {
        "reply_fields": [
            "not implemented yet",
        ],
        "mention_fields": [
            "not implemented yet",
        ],
        "notes": "Reserved for future source integration.",
    },
}
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _parse_ts(value: str) -> Optional[datetime]:
|
|
68
|
+
if not value:
|
|
69
|
+
return None
|
|
70
|
+
text = str(value).strip()
|
|
71
|
+
if not text:
|
|
72
|
+
return None
|
|
73
|
+
try:
|
|
74
|
+
return datetime.fromisoformat(text.replace("Z", "+00:00"))
|
|
75
|
+
except ValueError:
|
|
76
|
+
return None
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _period_key(ts: str, granularity: str) -> str:
    """Bucket a timestamp string into a period label.

    Args:
        ts: Timestamp text, parsed with ``_parse_ts``.
        granularity: "quarter" or "year"; any other value yields a month key.

    Returns:
        "unknown" when the timestamp cannot be parsed, otherwise
        "<year>-Q<n>" for quarters, "<year>" for years, or a zero-padded
        "YYYY-MM" for months.
    """
    parsed = _parse_ts(ts)
    if parsed is None:
        return "unknown"

    year, month = parsed.year, parsed.month
    if granularity == "year":
        return f"{year}"
    if granularity == "quarter":
        return f"{year}-Q{(month - 1) // 3 + 1}"
    return f"{year:04d}-{month:02d}"
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _extract_mentions(content: Optional[str]) -> Set[str]:
    """Return the lower-cased "@token" mentions found in ``content``.

    Tokens are located with ``MENTION_PATTERN``. Because ".", "+" and "-" are
    legal inside a token, the pattern also swallows punctuation that merely
    follows a mention (e.g. "thanks @alice." captures "alice."). For each
    match the token is therefore returned as captured and, additionally, with
    trailing ".", "+" and "-" characters removed, provided at least two
    characters remain (the pattern's own minimum length).

    Args:
        content: Message text; ``None`` or empty text yields an empty set.

    Returns:
        A set of candidate mention tokens, without the leading "@".
    """
    if not content:
        return set()

    mentions: Set[str] = set()
    for match in MENTION_PATTERN.finditer(content):
        token = match.group(1).lower()
        mentions.add(token)
        # Keep the raw token too, so names that really end in punctuation
        # still resolve exactly as before.
        trimmed = token.rstrip(".+-")
        if len(trimmed) >= 2:
            mentions.add(trimmed)
    return mentions
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _normalize_source_ids(source_ids: Optional[Sequence[str]]) -> Optional[List[str]]:
|
|
98
|
+
if not source_ids:
|
|
99
|
+
return None
|
|
100
|
+
norm = sorted({str(s).strip() for s in source_ids if str(s).strip()})
|
|
101
|
+
return norm or None
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _sql_in_clause(values: Sequence[str]) -> Tuple[str, List[str]]:
|
|
105
|
+
placeholders = ",".join(["?"] * len(values))
|
|
106
|
+
return f"({placeholders})", list(values)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _rows_to_dicts(cursor_rows: Iterable[Any]) -> List[Dict[str, Any]]:
|
|
110
|
+
out: List[Dict[str, Any]] = []
|
|
111
|
+
for row in cursor_rows:
|
|
112
|
+
if isinstance(row, dict):
|
|
113
|
+
out.append(dict(row))
|
|
114
|
+
continue
|
|
115
|
+
if hasattr(row, "keys"):
|
|
116
|
+
out.append({k: row[k] for k in row.keys()})
|
|
117
|
+
continue
|
|
118
|
+
raise TypeError("Expected sqlite Row/dict rows; ensure connection.row_factory is set")
|
|
119
|
+
return out
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _load_contact_profiles(
    conn: Any,
    *,
    dataset_id: str,
) -> Dict[str, Dict[str, Any]]:
    """Load display names and known usernames for every contact in a dataset.

    Args:
        conn: Connection object whose ``execute(...).fetchall()`` rows are
            accepted by ``_rows_to_dicts``.
        dataset_id: Dataset whose ``contacts`` rows are read.

    Returns:
        A mapping of ``str(contact_id)`` to
        ``{"display_name": ..., "known_usernames": [...]}``. The username list
        is decoded from ``known_usernames_json``; it is empty when that column
        is blank, is not a JSON list, or fails to decode.
    """
    cursor = conn.execute(
        """
        SELECT contact_id, display_name, known_usernames_json
        FROM contacts
        WHERE dataset_id = ?
        """,
        (dataset_id,),
    )

    profiles: Dict[str, Dict[str, Any]] = {}
    for record in _rows_to_dicts(cursor.fetchall()):
        usernames: List[str] = []
        raw = record.get("known_usernames_json")
        if isinstance(raw, str) and raw.strip():
            try:
                decoded = json.loads(raw)
                if isinstance(decoded, list):
                    stripped = (str(item).strip() for item in decoded)
                    usernames = [text for text in stripped if text]
            except Exception:
                # Best effort: a malformed username blob leaves the list empty.
                usernames = []
        profiles[str(record["contact_id"])] = {
            "display_name": record.get("display_name"),
            "known_usernames": usernames,
        }
    return profiles
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _load_contact_identifiers(
    conn: Any,
    *,
    dataset_id: str,
    source_ids: Optional[Sequence[str]],
) -> Dict[str, Set[str]]:
    """Collect the lower-cased identifiers recorded for each contact.

    Args:
        conn: Connection object whose ``execute(...).fetchall()`` rows are
            accepted by ``_rows_to_dicts``.
        dataset_id: Dataset whose ``contact_identifiers`` rows are read.
        source_ids: When non-empty, only rows for these sources plus rows
            whose ``source_id`` is the literal '*' are read.

    Returns:
        A ``defaultdict`` mapping contact id to a set of identifiers. Rows
        with a blank contact id or identifier are ignored.
    """
    if source_ids:
        in_clause, in_params = _sql_in_clause(source_ids)
        condition = (
            f"(dataset_id = ? AND source_id IN {in_clause}) "
            "OR (dataset_id = ? AND source_id = '*')"
        )
        # dataset_id is bound twice: once per branch of the OR.
        bind_values: List[Any] = [dataset_id, *in_params, dataset_id]
    else:
        condition = "dataset_id = ?"
        bind_values = [dataset_id]

    fetched = conn.execute(
        f"""
        SELECT contact_id, identifier
        FROM contact_identifiers
        WHERE {condition}
        """,
        tuple(bind_values),
    ).fetchall()

    grouped: Dict[str, Set[str]] = defaultdict(set)
    for record in _rows_to_dicts(fetched):
        owner = str(record.get("contact_id") or "").strip()
        value = str(record.get("identifier") or "").strip()
        if not (owner and value):
            continue
        grouped[owner].add(value.lower())
    return grouped
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _participant_aliases_for_contacts(
|
|
184
|
+
*,
|
|
185
|
+
contact_ids: Iterable[str],
|
|
186
|
+
contact_profiles: Dict[str, Dict[str, Any]],
|
|
187
|
+
contact_identifiers: Dict[str, Set[str]],
|
|
188
|
+
) -> Dict[str, Set[str]]:
|
|
189
|
+
aliases_by_contact: Dict[str, Set[str]] = {}
|
|
190
|
+
for contact_id in contact_ids:
|
|
191
|
+
aliases: Set[str] = set()
|
|
192
|
+
profile = contact_profiles.get(contact_id) or {}
|
|
193
|
+
display_name = str(profile.get("display_name") or "").strip()
|
|
194
|
+
if display_name:
|
|
195
|
+
aliases.add(display_name.lower())
|
|
196
|
+
aliases.add(display_name.replace(" ", "").lower())
|
|
197
|
+
for username in profile.get("known_usernames", []) or []:
|
|
198
|
+
uname = str(username).strip().lower()
|
|
199
|
+
if uname:
|
|
200
|
+
aliases.add(uname)
|
|
201
|
+
for identifier in contact_identifiers.get(contact_id, set()):
|
|
202
|
+
aliases.add(identifier.lower())
|
|
203
|
+
aliases_by_contact[contact_id] = {a for a in aliases if a}
|
|
204
|
+
return aliases_by_contact
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _load_contact_id_lookup_by_identifier(
    conn: Any,
    *,
    dataset_id: str,
    source_ids: Optional[Sequence[str]],
) -> Dict[Tuple[str, str], str]:
    """Map ``(source_id, identifier)`` pairs to contact ids.

    Args:
        conn: Connection object whose ``execute(...).fetchall()`` rows are
            accepted by ``_rows_to_dicts``.
        dataset_id: Dataset whose ``contact_identifiers`` rows are read.
        source_ids: When non-empty, only rows for these sources plus rows
            whose ``source_id`` is the literal '*' are read.

    Returns:
        A dict keyed by the stripped ``(source_id, identifier)`` pair. Rows
        with any blank field are skipped; when a key repeats, the row
        returned last wins.
    """
    clauses = ["dataset_id = ?"]
    bind_values: List[Any] = [dataset_id]
    if source_ids:
        in_clause, in_params = _sql_in_clause(source_ids)
        clauses.append(f"(source_id IN {in_clause} OR source_id = '*')")
        bind_values += in_params
    where_sql = " AND ".join(clauses)

    fetched = conn.execute(
        f"""
        SELECT source_id, identifier, contact_id
        FROM contact_identifiers
        WHERE {where_sql}
        """,
        tuple(bind_values),
    ).fetchall()

    lookup: Dict[Tuple[str, str], str] = {}
    for record in _rows_to_dicts(fetched):
        src, identifier, contact_id = (
            str(record.get(column) or "").strip()
            for column in ("source_id", "identifier", "contact_id")
        )
        if src and identifier and contact_id:
            lookup[(src, identifier)] = contact_id
    return lookup
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _build_unique_alias_lookup(aliases_by_contact: Dict[str, Set[str]]) -> Dict[str, str]:
|
|
240
|
+
collisions: Dict[str, Set[str]] = defaultdict(set)
|
|
241
|
+
for contact_id, aliases in aliases_by_contact.items():
|
|
242
|
+
for alias in aliases:
|
|
243
|
+
collisions[alias].add(contact_id)
|
|
244
|
+
return {
|
|
245
|
+
alias: next(iter(contact_ids))
|
|
246
|
+
for alias, contact_ids in collisions.items()
|
|
247
|
+
if len(contact_ids) == 1
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def extract_messenger_graph(
    *,
    dataset_id: str,
    conn: Optional[Any] = None,
    start_ts: Optional[str] = None,
    end_ts: Optional[str] = None,
    source_ids: Optional[Sequence[str]] = None,
    period_granularity: str = "month",
    cumulative: bool = False,
) -> Dict[str, Any]:
    """Extract messenger graph nodes/edges per period from canonical tables.

    Nodes are contacts that either appear in ``conversation_participants``
    for a conversation active in the period or sent a message in it. Edges
    are undirected and combine three counts per contact pair:
    co-participation in a conversation, replies (via ``reply_to_message_id``)
    and @mentions that resolve to exactly one member of the conversation.

    Args:
        dataset_id: Dataset to read; required.
        conn: Connection to query. Its rows must be dicts or expose
            ``keys()`` (see ``_rows_to_dicts``). When omitted, the
            connection comes from ``core.state.get_db_connection``.
        start_ts: Inclusive lower bound compared against ``event_at``.
        end_ts: Inclusive upper bound compared against ``event_at``.
        source_ids: Optional source filter; normalized with
            ``_normalize_source_ids``.
        period_granularity: "month", "quarter" or "year".
        cumulative: When true, each period also contains every message of
            the periods sorted before it.

    Returns:
        {
            "period_granularity": "month|quarter|year",
            "source_ids": [...],
            "periods": [
                {
                    "period_key": "YYYY-MM",
                    "nodes": [{"id", "label", "source_ids"}],
                    "edges": [{"source","target","weight","edge_type","edge_type_counts"}],
                },
            ],
            "investigation": SOURCE_DIRECT_LINK_INVESTIGATION,
        }
        Periods are ordered by their key as strings; messages whose
        ``event_at`` cannot be parsed fall into the "unknown" period.

    Raises:
        ValueError: If ``dataset_id`` is empty or ``period_granularity`` is
            not one of the supported values.
        RuntimeError: If no database connection is available.
    """
    if not dataset_id:
        raise ValueError("dataset_id is required")
    if period_granularity not in {"month", "quarter", "year"}:
        raise ValueError("period_granularity must be one of: month, quarter, year")

    if conn is not None:
        db = conn
    else:
        # Resolved only when the caller did not pass a connection.
        from ..core.state import get_db_connection

        db = get_db_connection()
    if db is None:
        raise RuntimeError("Database connection not available")

    normalized_sources = _normalize_source_ids(source_ids)
    query_params: List[Any] = [dataset_id]
    where = ["m.dataset_id = ?"]
    if start_ts:
        where.append("m.event_at >= ?")
        query_params.append(start_ts)
    if end_ts:
        where.append("m.event_at <= ?")
        query_params.append(end_ts)
    if normalized_sources:
        in_clause, in_params = _sql_in_clause(normalized_sources)
        where.append(f"m.source_id IN {in_clause}")
        query_params.extend(in_params)
    where_sql = " AND ".join(where)

    rows = db.execute(
        f"""
        SELECT
            m.message_id,
            m.conversation_id,
            m.source_id,
            m.sender_id,
            m.reply_to_message_id,
            m.content,
            m.event_at
        FROM conversation_messages m
        WHERE {where_sql}
        ORDER BY m.event_at ASC, m.message_id ASC
        """,
        tuple(query_params),
    ).fetchall()
    message_rows = _rows_to_dicts(rows)
    if not message_rows:
        return {
            "period_granularity": period_granularity,
            "source_ids": normalized_sources or [],
            "periods": [],
            "investigation": SOURCE_DIRECT_LINK_INVESTIGATION,
        }

    contact_lookup = _load_contact_id_lookup_by_identifier(
        db,
        dataset_id=dataset_id,
        source_ids=normalized_sources,
    )

    messages_by_period: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
    conversation_keys_by_period: Dict[str, Set[Tuple[str, str]]] = defaultdict(set)
    # message_id -> sender contact id across ALL fetched messages, so a reply
    # can resolve its target even when the target lies in another period.
    global_message_sender: Dict[str, str] = {}

    for row in message_rows:
        sender_id = str(row.get("sender_id") or "").strip()
        source_id = str(row.get("source_id") or "").strip()
        sender_contact_id = ""
        if sender_id and source_id:
            # Prefer a source-specific identifier, then fall back to '*'.
            sender_contact_id = (
                contact_lookup.get((source_id, sender_id), "")
                or contact_lookup.get(("*", sender_id), "")
            )
        row["sender_contact_id"] = sender_contact_id

        period_key = _period_key(str(row.get("event_at") or ""), period_granularity)
        messages_by_period[period_key].append(row)
        conversation_keys_by_period[period_key].add(_conversation_key(row))
        message_id = str(row.get("message_id") or "")
        if message_id and sender_contact_id:
            global_message_sender[message_id] = sender_contact_id

    cp_params: List[Any] = [dataset_id]
    cp_where = ["dataset_id = ?"]
    if normalized_sources:
        in_clause, in_params = _sql_in_clause(normalized_sources)
        cp_where.append(f"source_id IN {in_clause}")
        cp_params.extend(in_params)
    cp_where_sql = " AND ".join(cp_where)
    participant_rows = _rows_to_dicts(
        db.execute(
            f"""
            SELECT conversation_id, source_id, contact_id
            FROM conversation_participants
            WHERE {cp_where_sql}
            """,
            tuple(cp_params),
        ).fetchall()
    )

    participants_by_conversation: Dict[Tuple[str, str], Set[str]] = defaultdict(set)
    for row in participant_rows:
        contact_id = str(row.get("contact_id") or "")
        if contact_id:
            participants_by_conversation[_conversation_key(row)].add(contact_id)

    contact_profiles = _load_contact_profiles(db, dataset_id=dataset_id)
    contact_identifiers = _load_contact_identifiers(
        db,
        dataset_id=dataset_id,
        source_ids=normalized_sources,
    )

    period_payloads: List[Dict[str, Any]] = []
    cumulative_messages: List[Dict[str, Any]] = []
    cumulative_conv_keys: Set[Tuple[str, str]] = set()

    for period_key in sorted(messages_by_period):
        current_messages = messages_by_period[period_key]
        current_conv_keys = conversation_keys_by_period[period_key]
        if cumulative:
            cumulative_messages.extend(current_messages)
            cumulative_conv_keys |= current_conv_keys
            period_messages = cumulative_messages
            period_conv_keys = cumulative_conv_keys
        else:
            period_messages = current_messages
            period_conv_keys = current_conv_keys

        # Start from declared membership (copied so the base sets stay intact).
        period_participants_by_conv: Dict[Tuple[str, str], Set[str]] = {
            conv_key: set(participants_by_conversation.get(conv_key, set()))
            for conv_key in period_conv_keys
        }

        participant_ids: Set[str] = set()
        node_sources: Dict[str, Set[str]] = defaultdict(set)

        for conv_key, contact_ids in period_participants_by_conv.items():
            src = conv_key[1]
            for contact_id in contact_ids:
                participant_ids.add(contact_id)
                if src:
                    node_sources[contact_id].add(src)

        # Add senders to membership and group attributable messages by
        # conversation in the same pass, so the edge loop below does not
        # rescan every period message for every conversation.
        messages_by_conv: Dict[
            Tuple[str, str], List[Tuple[str, Dict[str, Any]]]
        ] = defaultdict(list)
        for msg in period_messages:
            contact_id = str(msg.get("sender_contact_id") or "").strip()
            if not contact_id:
                continue
            conv_key = _conversation_key(msg)
            participant_ids.add(contact_id)
            if conv_key[1]:
                node_sources[contact_id].add(conv_key[1])
            period_participants_by_conv.setdefault(conv_key, set()).add(contact_id)
            messages_by_conv[conv_key].append((contact_id, msg))

        aliases_by_contact = _participant_aliases_for_contacts(
            contact_ids=participant_ids,
            contact_profiles=contact_profiles,
            contact_identifiers=contact_identifiers,
        )

        co_edges: Counter[Tuple[str, str]] = Counter()
        reply_edges: Counter[Tuple[str, str]] = Counter()
        mention_edges: Counter[Tuple[str, str]] = Counter()

        for conv_key, members in period_participants_by_conv.items():
            for src_id, tgt_id in combinations(sorted(members), 2):
                if src_id and tgt_id:
                    co_edges[(src_id, tgt_id)] += 1

            conv_messages = messages_by_conv.get(conv_key)
            if not conv_messages:
                continue

            # Mentions resolve only against this conversation's members, and
            # only through aliases that identify exactly one of them.
            conv_alias_lookup = _build_unique_alias_lookup(
                {cid: aliases_by_contact.get(cid, set()) for cid in members}
            )
            for sender_id, msg in conv_messages:
                reply_to_message_id = str(msg.get("reply_to_message_id") or "").strip()
                if reply_to_message_id:
                    target_id = global_message_sender.get(reply_to_message_id)
                    if target_id and target_id in members and target_id != sender_id:
                        reply_edges[tuple(sorted((sender_id, target_id)))] += 1

                for mention in _extract_mentions(msg.get("content")):
                    target_id = conv_alias_lookup.get(mention)
                    if target_id and target_id != sender_id:
                        mention_edges[tuple(sorted((sender_id, target_id)))] += 1

        nodes_payload: List[Dict[str, Any]] = []
        for contact_id in sorted(participant_ids):
            profile = contact_profiles.get(contact_id) or {}
            label = str(profile.get("display_name") or "").strip() or contact_id
            nodes_payload.append(
                {
                    "id": contact_id,
                    "label": label,
                    "source_ids": sorted(node_sources.get(contact_id, set())),
                }
            )

        period_payloads.append(
            {
                "period_key": period_key,
                "nodes": nodes_payload,
                "edges": _merge_edge_counts(co_edges, reply_edges, mention_edges),
            }
        )

    return {
        "period_granularity": period_granularity,
        "source_ids": normalized_sources or [],
        "periods": period_payloads,
        "investigation": SOURCE_DIRECT_LINK_INVESTIGATION,
    }


def _conversation_key(row: Dict[str, Any]) -> Tuple[str, str]:
    """Return the ``(conversation_id, source_id)`` pair identifying a row's conversation."""
    return (str(row.get("conversation_id") or ""), str(row.get("source_id") or ""))


def _merge_edge_counts(
    co_edges: Dict[Tuple[str, str], int],
    reply_edges: Dict[Tuple[str, str], int],
    mention_edges: Dict[Tuple[str, str], int],
) -> List[Dict[str, Any]]:
    """Combine the three per-pair counters into the sorted edge payload list."""
    labelled = (
        ("co_participation", co_edges),
        ("direct_reply", reply_edges),
        ("direct_mention", mention_edges),
    )
    all_edges = set(co_edges) | set(reply_edges) | set(mention_edges)
    edges_payload: List[Dict[str, Any]] = []
    for src_id, tgt_id in sorted(all_edges):
        edge_type_counts: Dict[str, int] = {}
        for label, counts in labelled:
            count = counts.get((src_id, tgt_id), 0)
            if count:
                edge_type_counts[label] = int(count)
        # A pair linked in one way keeps that label; otherwise it is "mixed".
        if len(edge_type_counts) == 1:
            edge_type = next(iter(edge_type_counts))
        else:
            edge_type = "mixed"
        edges_payload.append(
            {
                "source": src_id,
                "target": tgt_id,
                "weight": sum(edge_type_counts.values()),
                "edge_type": edge_type,
                "edge_type_counts": edge_type_counts,
            }
        )
    return edges_payload
|