topos-node 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shared/__init__.py +59 -0
- shared/filtering.py +640 -0
- shared/schema_registry.py +229 -0
- topos/__init__.py +5 -0
- topos/__version__.py +6 -0
- topos/analytics/__init__.py +15 -0
- topos/analytics/duckdb_adapter.py +48 -0
- topos/analytics/messenger_communities.py +349 -0
- topos/analytics/messenger_graph.py +522 -0
- topos/analytics/messenger_labels.py +321 -0
- topos/analytics/profiles.py +22 -0
- topos/analytics/query_engine.py +64 -0
- topos/analytics/raw_queries.py +174 -0
- topos/api/__init__.py +1 -0
- topos/api/analytics.py +52 -0
- topos/api/app_registry.py +31 -0
- topos/api/backup.py +15 -0
- topos/api/compute_remote.py +175 -0
- topos/api/data_commit.py +158 -0
- topos/api/data_explorer_table_prefs.py +81 -0
- topos/api/db.py +10 -0
- topos/api/device.py +25 -0
- topos/api/enrichment.py +959 -0
- topos/api/filter_lab.py +195 -0
- topos/api/health.py +61 -0
- topos/api/ingestion_api.py +37 -0
- topos/api/ingestion_compat.py +21 -0
- topos/api/ingestion_sources.py +600 -0
- topos/api/llm.py +76 -0
- topos/api/local_mcp.py +46 -0
- topos/api/messenger_analytics.py +385 -0
- topos/api/query_api.py +13 -0
- topos/api/sanitization_ollama_config.py +64 -0
- topos/api/source_install.py +324 -0
- topos/api/sources.py +13 -0
- topos/api/sync.py +10 -0
- topos/api/ui_config.py +83 -0
- topos/api/uma_data.py +311 -0
- topos/api/usage.py +49 -0
- topos/api/user_identity.py +46 -0
- topos/app.py +239 -0
- topos/auth.py +17 -0
- topos/canonicalization/__init__.py +1 -0
- topos/canonicalization/mappers/__init__.py +22 -0
- topos/canonicalization/mappers/base.py +26 -0
- topos/canonicalization/mappers/chatgpt_mapper.py +40 -0
- topos/canonicalization/mappers/grok_mapper.py +17 -0
- topos/canonicalization/mappers/messenger_mapper.py +58 -0
- topos/canonicalization/models.py +31 -0
- topos/canonicalization/resolver.py +23 -0
- topos/cli/__init__.py +1 -0
- topos/cli/__main__.py +6 -0
- topos/cli/commands.py +132 -0
- topos/config/__init__.py +1 -0
- topos/config/sanitization_ollama.py +189 -0
- topos/config/settings.py +310 -0
- topos/contacts/__init__.py +5 -0
- topos/contacts/identity.py +24 -0
- topos/control_plane_client.py +300 -0
- topos/core/__init__.py +1 -0
- topos/core/api_models.py +128 -0
- topos/core/connection_resilience.py +99 -0
- topos/core/device_helpers.py +8 -0
- topos/core/errors.py +13 -0
- topos/core/events.py +12 -0
- topos/core/handlers.py +5625 -0
- topos/core/logging.py +175 -0
- topos/core/metrics.py +21 -0
- topos/core/startup_banner.py +62 -0
- topos/core/state.py +682 -0
- topos/core/table_layers.py +45 -0
- topos/core/types.py +13 -0
- topos/data_explorer_table_prefs.py +150 -0
- topos/engine/__init__.py +29 -0
- topos/engine/backends/__init__.py +50 -0
- topos/engine/backends/base.py +21 -0
- topos/engine/backends/huggingface.py +151 -0
- topos/engine/backends/ollama.py +181 -0
- topos/engine/backends/stub.py +22 -0
- topos/engine/engine.py +165 -0
- topos/engine/intake.py +32 -0
- topos/engine/queue_manager.py +112 -0
- topos/engine/registration.py +126 -0
- topos/engine/result_formatter.py +38 -0
- topos/engine/router.py +19 -0
- topos/engine/scoped_token.py +82 -0
- topos/engine/tasks.py +154 -0
- topos/engine/transport.py +44 -0
- topos/engine/usage_guard.py +100 -0
- topos/engine/usage_observation.py +129 -0
- topos/engine/validator.py +23 -0
- topos/enrichment/__init__.py +1 -0
- topos/enrichment/derived_tables.py +214 -0
- topos/enrichment/jobs/__init__.py +30 -0
- topos/enrichment/jobs/base.py +54 -0
- topos/enrichment/jobs/canonical/__init__.py +1 -0
- topos/enrichment/jobs/canonical/embeddings_job.py +27 -0
- topos/enrichment/jobs/canonical/emo_27_job.py +97 -0
- topos/enrichment/jobs/canonical/entities_job.py +27 -0
- topos/enrichment/jobs/canonical/sentiment_job.py +27 -0
- topos/enrichment/jobs/canonical/topics_job.py +27 -0
- topos/enrichment/jobs/raw/__init__.py +1 -0
- topos/enrichment/jobs/raw/attachments_job.py +12 -0
- topos/enrichment/jobs/raw/language_job.py +12 -0
- topos/enrichment/jobs/raw/time_normalization_job.py +12 -0
- topos/enrichment/jobs/raw/tool_calls_job.py +12 -0
- topos/enrichment/models/__init__.py +1 -0
- topos/enrichment/models/manager.py +8 -0
- topos/enrichment/models/registry.py +71 -0
- topos/enrichment/models/versioning.py +8 -0
- topos/enrichment/orchestrator.py +177 -0
- topos/enrichment/processor.py +17 -0
- topos/enrichment/progress_bar.py +122 -0
- topos/enrichment/website_classifier.py +31 -0
- topos/filter_lab/__init__.py +1 -0
- topos/filter_lab/bundles.py +300 -0
- topos/filter_lab/schema.py +86 -0
- topos/filter_lab/service.py +167 -0
- topos/filter_lab/store.py +374 -0
- topos/filter_lab/worker.py +250 -0
- topos/hosted_pool_lease.py +153 -0
- topos/ingestion/__init__.py +1 -0
- topos/ingestion/checkpoints/__init__.py +6 -0
- topos/ingestion/checkpoints/checkpoint_store.py +24 -0
- topos/ingestion/checkpoints/sqlite_checkpoint_store.py +82 -0
- topos/ingestion/ingest_helpers.py +504 -0
- topos/ingestion/jobs.py +91 -0
- topos/ingestion/local_sync.py +823 -0
- topos/ingestion/log_preview.py +21 -0
- topos/ingestion/manager.py +1100 -0
- topos/ingestion/parser.py +174 -0
- topos/ingestion/parsers/__init__.py +32 -0
- topos/ingestion/parsers/base.py +24 -0
- topos/ingestion/parsers/browser_parser.py +171 -0
- topos/ingestion/parsers/calendar_parser.py +21 -0
- topos/ingestion/parsers/chatgpt_conversation_flattener.py +266 -0
- topos/ingestion/parsers/chatgpt_parser.py +67 -0
- topos/ingestion/parsers/grok_parser.py +21 -0
- topos/ingestion/parsers/messenger_parser.py +97 -0
- topos/ingestion/progress.py +54 -0
- topos/ingestion/sources/__init__.py +20 -0
- topos/ingestion/sources/base.py +39 -0
- topos/ingestion/sources/calendar.py +29 -0
- topos/ingestion/sources/chatgpt.py +29 -0
- topos/ingestion/sources/contact_importers.py +274 -0
- topos/ingestion/sources/grok.py +29 -0
- topos/ingestion/sources/imessage_reader.py +479 -0
- topos/ingestion/sources/signal_export_parser.py +132 -0
- topos/ingestion/sources/signal_reader.py +491 -0
- topos/ingestion/state_machine.py +70 -0
- topos/ingestion/triggers/__init__.py +1 -0
- topos/ingestion/triggers/file_trigger.py +36 -0
- topos/ingestion/triggers/sqlite_trigger.py +18 -0
- topos/ingestion/validation/__init__.py +1 -0
- topos/ingestion/validation/base.py +27 -0
- topos/ingestion/validation/schema_registry.py +111 -0
- topos/ingestion/validation/schema_validator.py +13 -0
- topos/lineage/__init__.py +1 -0
- topos/lineage/provenance.py +9 -0
- topos/lineage/tracker.py +9 -0
- topos/mcp_stdio_proxy.py +83 -0
- topos/observability/__init__.py +1 -0
- topos/observability/alerts.py +7 -0
- topos/observability/metrics.py +25 -0
- topos/observability/tracing.py +18 -0
- topos/openai_client.py +69 -0
- topos/projections/__init__.py +1 -0
- topos/projections/vector_index/__init__.py +1 -0
- topos/projections/vector_index/base.py +21 -0
- topos/projections/vector_index/builders.py +11 -0
- topos/projections/vector_index/health_checks.py +5 -0
- topos/rate_limit.py +43 -0
- topos/sanitization/__init__.py +16 -0
- topos/sanitization/ollama_transforms.py +276 -0
- topos/scope_resolution.py +89 -0
- topos/services/__init__.py +1 -0
- topos/services/container.py +46 -0
- topos/services/embeddings/__init__.py +1 -0
- topos/services/embeddings/base.py +7 -0
- topos/services/embeddings/local.py +9 -0
- topos/services/embeddings/remote.py +9 -0
- topos/services/interfaces.py +40 -0
- topos/services/llm/__init__.py +1 -0
- topos/services/llm/base.py +7 -0
- topos/services/llm/openai.py +126 -0
- topos/services/local.py +123 -0
- topos/services/postgres.py +385 -0
- topos/sources/__init__.py +6 -0
- topos/sources/definitions.py +114 -0
- topos/sources/install_service.py +836 -0
- topos/sources/registry.py +263 -0
- topos/sources/runtime_install.py +427 -0
- topos/storage/__init__.py +1 -0
- topos/storage/canonical/__init__.py +18 -0
- topos/storage/canonical/ai_chat/__init__.py +22 -0
- topos/storage/canonical/ai_chat/canonicalizer.py +147 -0
- topos/storage/canonical/ai_chat/mapper.py +168 -0
- topos/storage/canonical/ai_chat/model.py +87 -0
- topos/storage/canonical/ai_chat/tables.py +179 -0
- topos/storage/canonical/canonical_store.py +24 -0
- topos/storage/canonical/conversations_tables.py +1020 -0
- topos/storage/canonical/mapping_store.py +30 -0
- topos/storage/canonical/postgres.py +10 -0
- topos/storage/db/__init__.py +1 -0
- topos/storage/db/client.py +8 -0
- topos/storage/db/migrations/__init__.py +1 -0
- topos/storage/db/migrations/stage9_column_renames.py +78 -0
- topos/storage/db/paths.py +122 -0
- topos/storage/db/postgres.py +240 -0
- topos/storage/db/schema.py +6 -0
- topos/storage/enrichment/__init__.py +1 -0
- topos/storage/enrichment/canonical_enrichment_store.py +7 -0
- topos/storage/enrichment/raw_enrichment_store.py +18 -0
- topos/storage/normalized/__init__.py +1 -0
- topos/storage/normalized/normalized_store.py +24 -0
- topos/storage/oplog/__init__.py +1 -0
- topos/storage/oplog/decision.py +6 -0
- topos/storage/oplog/oplog_store.py +17 -0
- topos/storage/oplog/postgres.py +10 -0
- topos/storage/projections/__init__.py +1 -0
- topos/storage/projections/index_ops_store.py +6 -0
- topos/storage/projections/vector_index_store.py +6 -0
- topos/storage/raw/__init__.py +1 -0
- topos/storage/raw/browser_flat_tables.py +303 -0
- topos/storage/raw/file_store.py +100 -0
- topos/storage/raw/raw_store.py +29 -0
- topos/storage/raw/raw_tables_manager.py +295 -0
- topos/storage/raw/sqlite_raw_store.py +17 -0
- topos/storage/security/encryption.py +21 -0
- topos/storage/signal_identity.py +71 -0
- topos/storage/source_settings.py +116 -0
- topos/storage/user_identity.py +69 -0
- topos/sync/__init__.py +5 -0
- topos/sync/client.py +272 -0
- topos/sync_handlers.py +70 -0
- topos/testing/__init__.py +1 -0
- topos/testing/lifespan.py +7 -0
- topos/uma_contact_enrichment.py +1032 -0
- topos/uma_filters.py +669 -0
- topos/uma_resource_id.py +24 -0
- topos/uma_rpt.py +69 -0
- topos/utils/base_object.py +61 -0
- topos/websocket_client.py +21 -0
- topos_node-0.1.0.dist-info/METADATA +199 -0
- topos_node-0.1.0.dist-info/RECORD +249 -0
- topos_node-0.1.0.dist-info/WHEEL +5 -0
- topos_node-0.1.0.dist-info/entry_points.txt +2 -0
- topos_node-0.1.0.dist-info/licenses/LICENSE +201 -0
- topos_node-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,1032 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Stage 11: UMA message contact participation + display name resolution.
|
|
3
|
+
|
|
4
|
+
Requires dataset_id and DB connection. Skips all logic when dataset_id is missing.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import logging
|
|
11
|
+
from collections import defaultdict
|
|
12
|
+
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
13
|
+
|
|
14
|
+
from shared.filtering import FilterManifest
|
|
15
|
+
|
|
16
|
+
from topos.analytics.messenger_labels import _identifier_candidates, resolve_participant_labels
|
|
17
|
+
from topos.contacts.identity import normalize_contact_key
|
|
18
|
+
from topos.storage.canonical.conversations_tables import CONTACT_IDENTIFIERS_TABLE, CONTACTS_TABLE
|
|
19
|
+
from topos.storage.user_identity import get_user_identity
|
|
20
|
+
|
|
21
|
+
# Module-level logger; the channel name is spelled out explicitly instead of using __name__.
logger = logging.getLogger("topos.uma_contact_enrichment")
|
|
22
|
+
|
|
23
|
+
# Baseline sharing policy: _parse_sharing_policy returns a copy of this dict when the stored
# value is missing or unparseable, and overlays recognised keys on a copy otherwise.
DEFAULT_SHARING_POLICY = {"name_visibility": "normal", "row_visibility": "exclude_from_grants"}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _table_exists(conn, table_name: str) -> bool:
|
|
27
|
+
try:
|
|
28
|
+
cur = conn.execute(
|
|
29
|
+
"SELECT name FROM sqlite_master WHERE type='table' AND name=?",
|
|
30
|
+
(table_name,),
|
|
31
|
+
)
|
|
32
|
+
return cur.fetchone() is not None
|
|
33
|
+
except Exception:
|
|
34
|
+
return False
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def strip_contact_runtime_filters(manifest: Optional[FilterManifest]) -> Optional[FilterManifest]:
    """Drop the contact filters that this module applies itself.

    Filters whose ``filter_id`` is ``message_contact_participation`` or
    ``contact_display_names`` are removed so that apply_filter_manifest does not
    double-apply them or error. ``None`` passes through unchanged, and the very
    same manifest object is returned when nothing had to be removed; otherwise a
    copy with the reduced ``filters`` list is returned.
    """
    if manifest is None:
        return None
    handled_here = {"message_contact_participation", "contact_display_names"}
    remaining = []
    for entry in manifest.filters:
        if entry.filter_id in handled_here:
            continue
        remaining.append(entry)
    if len(remaining) != len(manifest.filters):
        return manifest.model_copy(update={"filters": remaining})
    return manifest
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _parse_sharing_policy(raw: Any) -> Dict[str, str]:
    """Turn a stored sharing policy into a complete policy dict.

    ``raw`` may be ``None``/empty, a dict, or anything whose ``str()`` is a JSON
    object. Only ``name_visibility`` and ``row_visibility`` are taken from it
    (values coerced with ``str``); everything else comes from
    ``DEFAULT_SHARING_POLICY``. Unparseable or non-object input yields a fresh
    copy of the defaults.
    """
    if raw is None or raw == "":
        return dict(DEFAULT_SHARING_POLICY)

    if isinstance(raw, dict):
        overrides = raw
    else:
        try:
            overrides = json.loads(str(raw))
        except Exception:
            return dict(DEFAULT_SHARING_POLICY)
        if not isinstance(overrides, dict):
            return dict(DEFAULT_SHARING_POLICY)

    recognised = ("name_visibility", "row_visibility")
    policy = dict(DEFAULT_SHARING_POLICY)
    for field, setting in overrides.items():
        if field in recognised:
            policy[field] = str(setting)
    return policy
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _contact_ids_for_literal_self_sender(id_mm: Dict[str, Set[str]]) -> Set[str]:
    """Collect contact IDs indexed under the literal sender handle ``self``.

    Every lookup key that ``_identifier_candidates("self")`` produces is checked
    against ``id_mm`` and the matching contact-id sets are unioned.
    """
    matched: Set[str] = set()
    for candidate in _identifier_candidates("self"):
        matched.update(id_mm.get(candidate) or ())
    return matched
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def build_identifier_contact_multimap(
    conn, dataset_id: str, _source_ids: Optional[Set[str]] = None
) -> Dict[str, Set[str]]:
    """
    Build ``lookup key -> {contact_id, ...}`` for resolving a message ``sender_id``.

    Each stored identifier is indexed under every key from ``_identifier_candidates``,
    under its ``normalize_contact_key`` form (when non-empty) and under the stripped
    identifier itself, so differently formatted handles land on the same keys.

    The values are sets on purpose: duplicate imports can create several ``contact_id``
    rows for one phone/email, and all of them must stay visible to later ranking
    (``pick_representative_contact_id``) instead of the first one winning.

    All ``contact_identifiers`` rows of ``dataset_id`` are read; ``_source_ids`` is
    accepted but not used. Returns ``{}`` when ``conn``/``dataset_id`` is falsy or the
    identifiers table does not exist.
    """
    if not conn or not dataset_id or not _table_exists(conn, CONTACT_IDENTIFIERS_TABLE):
        return {}

    query = f"""
        SELECT identifier, contact_id
        FROM {CONTACT_IDENTIFIERS_TABLE}
        WHERE dataset_id = ?
        """
    index: Dict[str, Set[str]] = defaultdict(set)
    for raw_identifier, raw_contact_id in conn.execute(query, (dataset_id,)).fetchall():
        if not (raw_identifier and raw_contact_id):
            continue
        identifier = str(raw_identifier).strip()
        contact_id = str(raw_contact_id).strip()

        lookup_keys = [key for key in _identifier_candidates(identifier) if key]
        normalized = normalize_contact_key(identifier)
        if normalized:
            lookup_keys.append(normalized)
        lookup_keys.append(identifier)

        for key in lookup_keys:
            index[key].add(contact_id)
    return dict(index)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _nanp_digit_lookup_keys(digits: str) -> Set[str]:
|
|
121
|
+
"""Link US NANP handles that differ only by formatting or leading country code 1."""
|
|
122
|
+
d = digits
|
|
123
|
+
if len(d) < 10:
|
|
124
|
+
return set()
|
|
125
|
+
keys: Set[str] = {d[-10:]}
|
|
126
|
+
if len(d) == 11 and d[0] == "1":
|
|
127
|
+
keys.add(d[1:])
|
|
128
|
+
keys.add(d)
|
|
129
|
+
return keys
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _nanp_lookup_keys_for_value(value: Any) -> Set[str]:
    """Strip everything but digit characters from ``value`` and return its digit lookup keys.

    Falsy values are treated as an empty string.
    """
    text = str(value or "")
    digits_only = "".join(filter(str.isdigit, text))
    return _nanp_digit_lookup_keys(digits_only)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def build_nanp_digit_contact_index(conn, dataset_id: str) -> Dict[str, Set[str]]:
    """
    Build ``digit key -> {contact_id, ...}`` from the dataset's contact identifiers.

    Keys come from ``_nanp_lookup_keys_for_value``, so rows such as ``+1512…``,
    ``(512) …`` and ``512…`` meet on the same key even when their string keys differ.
    Returns ``{}`` when ``conn``/``dataset_id`` is falsy or the query fails (the
    failure is logged as a warning).
    """
    if not (conn and dataset_id):
        return {}
    try:
        fetched = conn.execute(
            f"""
            SELECT identifier, contact_id
            FROM {CONTACT_IDENTIFIERS_TABLE}
            WHERE dataset_id = ?
            """,
            (dataset_id,),
        ).fetchall()
    except Exception as exc:  # noqa: BLE001
        logger.warning("build_nanp_digit_contact_index failed: %s", exc)
        return {}

    by_digits: Dict[str, Set[str]] = defaultdict(set)
    for raw_identifier, raw_contact_id in fetched:
        if raw_identifier and raw_contact_id:
            identifier = str(raw_identifier).strip()
            contact_id = str(raw_contact_id).strip()
            for digit_key in _nanp_lookup_keys_for_value(identifier):
                by_digits[digit_key].add(contact_id)
    return dict(by_digits)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _uma_graph_display_name_for_row(
    graph_labels: Dict[str, Dict[str, str]],
    row: Dict[str, Any],
) -> str:
    """Look up a graph-resolved ``display_name`` for the handles on a message row.

    The row's ``sender_id`` is tried first, then the handle found in its metadata
    (if different). Rows without a sender, or whose sender is the ``self``
    placeholder, yield ``""``; so does a row with no non-blank name.
    """
    sender = str(row.get("sender_id") or "").strip()
    if sender == "" or sender.lower() == "self":
        return ""

    lookup_order: List[str] = [sender]
    chat_handle = _metadata_chat_identifier(row)
    if chat_handle:
        chat_handle = str(chat_handle).strip()
        if chat_handle and chat_handle != sender:
            lookup_order.append(chat_handle)

    for handle in lookup_order:
        entry = graph_labels.get(handle, {})
        name = (entry.get("display_name") or "").strip()
        if name:
            return name
    return ""
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _graph_display_name_respects_name_policy(
|
|
190
|
+
graph_dn: str,
|
|
191
|
+
cids: Set[str],
|
|
192
|
+
display_names: Dict[str, Optional[str]],
|
|
193
|
+
name_block: Set[str],
|
|
194
|
+
) -> str:
|
|
195
|
+
"""
|
|
196
|
+
Only use graph-resolved names that correspond to a contact on this row whose name may be shown.
|
|
197
|
+
|
|
198
|
+
``resolve_participant_labels`` ignores sharing_policy; we still must not surface a name marked hidden.
|
|
199
|
+
"""
|
|
200
|
+
g = (graph_dn or "").strip()
|
|
201
|
+
if not g or not cids:
|
|
202
|
+
return ""
|
|
203
|
+
holders = {c for c in cids if (display_names.get(c) or "").strip() == g}
|
|
204
|
+
if not holders:
|
|
205
|
+
return ""
|
|
206
|
+
if holders & name_block:
|
|
207
|
+
return ""
|
|
208
|
+
return g
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _owner_graph_display_name(
    graph_labels: Dict[str, Dict[str, str]],
    self_contact_id: str,
) -> str:
    """
    Pick a name for the dataset owner from the graph entry keyed by ``self_contact_id``.

    The entry's ``display_name`` wins when it is non-blank and not the ``self``
    placeholder. Otherwise its ``label`` is used, unless that is blank, the
    placeholder, or merely the contact id itself; then ``""`` is returned.
    """
    entry = graph_labels.get(self_contact_id, {})

    name = (entry.get("display_name") or "").strip()
    if name and not _is_imessage_self_sentinel_label(name):
        return name

    label = (entry.get("label") or "").strip()
    usable = (
        bool(label)
        and not _is_imessage_self_sentinel_label(label)
        and label != self_contact_id
    )
    return label if usable else ""
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _metadata_chat_identifier(row: Dict[str, Any]) -> Optional[str]:
|
|
232
|
+
"""iMessage-style metadata often duplicates the peer handle in ``chat_identifier``."""
|
|
233
|
+
mj = row.get("metadata_json")
|
|
234
|
+
if mj is None:
|
|
235
|
+
return None
|
|
236
|
+
if isinstance(mj, str):
|
|
237
|
+
try:
|
|
238
|
+
mj = json.loads(mj)
|
|
239
|
+
except Exception:
|
|
240
|
+
return None
|
|
241
|
+
if not isinstance(mj, dict):
|
|
242
|
+
return None
|
|
243
|
+
for key in ("chat_identifier", "handle"):
|
|
244
|
+
v = mj.get(key)
|
|
245
|
+
if v is not None and str(v).strip():
|
|
246
|
+
return str(v).strip()
|
|
247
|
+
return None
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def message_row_contact_id(
    row: Dict[str, Any],
    id_mm: Dict[str, Set[str]],
    *,
    self_contact_id: Optional[str] = None,
    nanp_idx: Optional[Dict[str, Set[str]]] = None,
) -> Optional[str]:
    """Resolve one contact_id for a message row (sender first, then metadata handle).

    Returns ``None`` when nothing matches and the sole match when there is exactly
    one. With several matches the choice is made without contact metadata: the
    smallest id of the first lookup key that hits ``id_mm`` (sender keys before
    metadata-handle keys), falling back to the smallest id overall. Callers that
    need a metadata-aware choice should use ``pick_representative_contact_id``.
    """
    matches = collect_message_contact_ids(
        row, id_mm, self_contact_id=self_contact_id, nanp_idx=nanp_idx
    )
    if not matches:
        return None
    if len(matches) == 1:
        return next(iter(matches))

    def first_hit(handle: Any) -> Optional[Set[str]]:
        # First non-empty id set found while walking the handle's keys in order.
        for lookup_key in _ordered_identifier_lookup_keys(handle):
            hit = id_mm.get(lookup_key)
            if hit:
                return hit
        return None

    chosen = first_hit(row.get("sender_id"))
    if not chosen:
        chat_handle = _metadata_chat_identifier(row)
        if chat_handle:
            chosen = first_hit(chat_handle)
    return min(chosen or matches)
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def _ordered_identifier_lookup_keys(sender_id: Any) -> List[str]:
    """List the lookup keys for a handle in a fixed, repeatable order.

    Order: the stripped handle, its lower-cased form, its ``normalize_contact_key``
    form, then the ``_identifier_candidates`` keys in sorted order. Empty keys and
    repeats are dropped (first occurrence wins). ``None`` or a blank handle gives ``[]``.
    """
    if sender_id is None:
        return []
    handle = str(sender_id).strip()
    if not handle:
        return []

    candidates: List[Any] = [handle, handle.lower(), normalize_contact_key(handle)]
    candidates.extend(sorted(_identifier_candidates(handle)))
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return [key for key in dict.fromkeys(candidates) if key]
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def collect_contact_ids_for_sender(
    sender_id: Any,
    id_mm: Dict[str, Set[str]],
    *,
    self_contact_id: Optional[str] = None,
    nanp_idx: Optional[Dict[str, Set[str]]] = None,
) -> Set[str]:
    """Gather every contact_id that any lookup key of ``sender_id`` maps to.

    For the ``self`` placeholder (any letter case) the result is ``self_contact_id``
    (when given) plus whatever ``id_mm`` holds under the placeholder's candidate
    keys; the normalized-key and digit-index lookups are not used in that case.
    For any other handle the candidate keys, the normalized key and, when
    ``nanp_idx`` is supplied, the digit keys are all consulted. ``None`` or a blank
    handle gives an empty set.
    """
    handle = "" if sender_id is None else str(sender_id).strip()
    if not handle:
        return set()

    found: Set[str] = set()
    is_owner_placeholder = handle.lower() == "self"
    if is_owner_placeholder and self_contact_id:
        found.add(self_contact_id)

    for candidate in _identifier_candidates(handle):
        found.update(id_mm.get(candidate) or ())
    if is_owner_placeholder:
        return found

    normalized = normalize_contact_key(handle)
    if normalized:
        found.update(id_mm.get(normalized) or ())
    if nanp_idx:
        for digit_key in _nanp_lookup_keys_for_value(handle):
            found.update(nanp_idx.get(digit_key) or ())
    return found
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def collect_message_contact_ids(
    row: Dict[str, Any],
    id_mm: Dict[str, Set[str]],
    *,
    self_contact_id: Optional[str] = None,
    nanp_idx: Optional[Dict[str, Set[str]]] = None,
) -> Set[str]:
    """Union the contact_ids matched by the row's ``sender_id`` and by its metadata handle."""
    combined: Set[str] = set(
        collect_contact_ids_for_sender(
            row.get("sender_id"),
            id_mm,
            self_contact_id=self_contact_id,
            nanp_idx=nanp_idx,
        )
    )
    chat_handle = _metadata_chat_identifier(row)
    if chat_handle:
        combined.update(
            collect_contact_ids_for_sender(
                chat_handle,
                id_mm,
                self_contact_id=self_contact_id,
                nanp_idx=nanp_idx,
            )
        )
    return combined
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def sender_row_contact_id(
    sender_id: Any,
    id_mm: Dict[str, Set[str]],
    *,
    self_contact_id: Optional[str] = None,
    nanp_idx: Optional[Dict[str, Set[str]]] = None,
) -> Optional[str]:
    """Resolve one contact_id for a bare sender handle.

    ``None`` for a missing/blank handle or when nothing matches; the sole match when
    there is exactly one. With several matches, the smallest id under the first
    ordered lookup key that hits ``id_mm`` is returned, else the smallest id overall.
    """
    if sender_id is None or not str(sender_id).strip():
        return None

    matches = collect_contact_ids_for_sender(
        sender_id,
        id_mm,
        self_contact_id=self_contact_id,
        nanp_idx=nanp_idx,
    )
    if not matches:
        return None
    if len(matches) > 1:
        for lookup_key in _ordered_identifier_lookup_keys(sender_id):
            hit = id_mm.get(lookup_key)
            if hit:
                return min(hit)
        return min(matches)
    return next(iter(matches))
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
def _has_letter(s: str) -> bool:
|
|
397
|
+
return any(ch.isalpha() for ch in s)
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def _is_imessage_self_sentinel_label(value: str) -> bool:
|
|
401
|
+
"""True for the iMessage owner placeholder string; not a human-readable display name."""
|
|
402
|
+
return str(value or "").strip().lower() == "self"
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
def _contact_label_score(
    cid: str,
    *,
    display_names: Dict[str, Optional[str]],
    known_usernames_by_cid: Dict[str, List[str]],
    fallback_labels: Dict[str, str],
) -> Tuple[int, int, str]:
    """
    Score how presentable the best available label of contact ``cid`` is.

    Returns ``(tier, length, tie_breaker)``; a larger tier, then a larger length, is better.
    tier: 3 = display_name containing a letter, 2 = some username containing a letter,
    1 = display_name without letters, 0 = nothing better than the fallback label.
    The ``self`` placeholder is never accepted as a label. ``tie_breaker`` is the
    chosen label, or ``cid`` when no label was found.
    """
    name = (display_names.get(cid) or "").strip()
    if name and not _is_imessage_self_sentinel_label(name):
        tier = 3 if _has_letter(name) else 1
        label = name
    else:
        tier = 0
        label = ""

    if tier < 2:
        # Usernames can only lift a contact to tier 2; the label is replaced only
        # by a username that is strictly longer than the current one.
        for raw_username in known_usernames_by_cid.get(cid) or []:
            username = str(raw_username).strip()
            if not username or _is_imessage_self_sentinel_label(username):
                continue
            if not _has_letter(username):
                continue
            tier = 2
            if len(username) > len(label):
                label = username

    if tier == 0:
        fallback = (fallback_labels.get(cid) or "").strip()
        if fallback and not _is_imessage_self_sentinel_label(fallback):
            label = fallback

    return (tier, len(label), label or cid)
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
def _row_hidden_by_default_policy(
|
|
443
|
+
cid: str,
|
|
444
|
+
policies: Dict[str, Dict[str, str]],
|
|
445
|
+
inherit_defaults: bool,
|
|
446
|
+
) -> bool:
|
|
447
|
+
if not inherit_defaults:
|
|
448
|
+
return False
|
|
449
|
+
pol = policies.get(cid)
|
|
450
|
+
if pol is None:
|
|
451
|
+
return False
|
|
452
|
+
return pol.get("row_visibility") == "exclude_from_grants"
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
def pick_representative_contact_id(
    cids: Set[str],
    *,
    display_names: Dict[str, Optional[str]],
    known_usernames_by_cid: Dict[str, List[str]],
    fallback_labels: Dict[str, str],
    policies: Dict[str, Dict[str, str]],
    inherit_defaults: bool,
) -> Optional[str]:
    """
    Choose one contact_id to stand for a group of duplicate contact rows.

    Candidates are ranked by, in order: not hidden by the default row policy, label
    tier, label length (both from ``_contact_label_score``), and finally the
    contact_id itself, highest wins. This prefers a named card over an unnamed
    duplicate of the same phone. Returns ``None`` for an empty set and the sole
    member for a singleton.
    """
    if not cids:
        return None
    if len(cids) == 1:
        return next(iter(cids))

    def rank(contact_id: str) -> Tuple[int, int, int, str]:
        tier, length, _tie_breaker = _contact_label_score(
            contact_id,
            display_names=display_names,
            known_usernames_by_cid=known_usernames_by_cid,
            fallback_labels=fallback_labels,
        )
        hidden = _row_hidden_by_default_policy(contact_id, policies, inherit_defaults)
        return (0 if hidden else 1, tier, length, contact_id)

    # The contact_id is part of the rank, so ranks are unique and max() is unambiguous.
    return max(cids, key=rank)
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
def visible_label_for_contact(
    cid: str,
    *,
    display_names: Dict[str, Optional[str]],
    known_usernames_by_cid: Dict[str, List[str]],
    fallback_labels: Dict[str, str],
) -> str:
    """Return the one string to show for ``cid``.

    Preference order: display_name, then the first usable username, then the
    fallback label. Blank values and the ``self`` placeholder are skipped at every
    step; ``""`` is returned when nothing usable exists.
    """

    def usable(text: str) -> bool:
        return bool(text) and not _is_imessage_self_sentinel_label(text)

    name = (display_names.get(cid) or "").strip()
    if usable(name):
        return name

    for raw_username in known_usernames_by_cid.get(cid) or []:
        username = str(raw_username).strip()
        if usable(username):
            return username

    fallback = (fallback_labels.get(cid) or "").strip()
    return fallback if usable(fallback) else ""
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
def load_self_contact_info(conn: Any, dataset_id: str) -> Tuple[Optional[str], Optional[str]]:
    """Fetch ``(contact_id, label)`` of the dataset owner, i.e. the ``is_self = 1`` contact row.

    The label is the row's display_name, or else the first usable entry of its
    ``known_usernames_json`` list; the ``self`` placeholder is never used as a
    label. ``(None, None)`` is returned when inputs are falsy, the query fails
    (logged as a warning) or no owner row exists; ``(contact_id, None)`` when the
    row exists but carries no usable label.
    """
    if not (conn and dataset_id):
        return None, None
    try:
        owner = conn.execute(
            f"""
            SELECT contact_id, display_name, known_usernames_json
            FROM {CONTACTS_TABLE}
            WHERE dataset_id = ? AND is_self = 1
            LIMIT 1
            """,
            (dataset_id,),
        ).fetchone()
    except Exception as exc:  # noqa: BLE001
        logger.warning("load_self_contact_info failed: %s", exc)
        return None, None
    if not owner:
        return None, None

    contact_id = str(owner[0] or "").strip() or None
    name = str(owner[1] or "").strip()
    if name and not _is_imessage_self_sentinel_label(name):
        return contact_id, name

    raw_usernames = owner[2]
    try:
        decoded = json.loads(raw_usernames or "[]")
        entries = decoded if isinstance(decoded, list) else []
        for entry in entries:
            username = str(entry).strip()
            if username and not _is_imessage_self_sentinel_label(username):
                return contact_id, username
    except Exception:
        # Malformed username JSON simply means there is no label to offer.
        pass
    return contact_id, None
|
|
542
|
+
|
|
543
|
+
|
|
544
|
+
def load_user_identity_display_name(conn: Any, dataset_id: str) -> Optional[str]:
    """Return the stripped ``display_name`` from the dataset's user identity record.

    ``None`` is returned when inputs are falsy, the lookup raises (logged as a
    warning), no identity exists, or the name is blank.
    """
    if not (conn and dataset_id):
        return None
    try:
        identity = get_user_identity(conn, dataset_id)
    except Exception as exc:  # noqa: BLE001
        logger.warning("load_user_identity_display_name failed: %s", exc)
        return None
    if identity:
        name = str(identity.get("display_name") or "").strip()
        if name:
            return name
    return None
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
def prefetch_contact_ids_for_conversations(
    conn: Any,
    dataset_id: str,
    conversation_ids: Set[str],
    id_mm: Dict[str, Set[str]],
    *,
    self_contact_id: Optional[str],
    nanp_idx: Optional[Dict[str, Set[str]]] = None,
) -> Set[str]:
    """Gather every contact_id that a sender in the given conversations resolves to.

    Distinct ``sender_id`` values are read from ``conversation_messages`` for the
    dataset and conversations, and each is passed to ``collect_contact_ids_for_sender``.
    An empty set is returned when an argument is missing/empty or the query fails
    (the failure is logged as a warning).
    """
    resolved: Set[str] = set()
    if not (conn and dataset_id and conversation_ids):
        return resolved
    ordered_ids = sorted(conversation_ids)
    placeholders = ",".join("?" * len(ordered_ids))
    params: List[Any] = [dataset_id]
    params.extend(ordered_ids)
    try:
        cursor = conn.execute(
            f"""
            SELECT DISTINCT sender_id
            FROM conversation_messages
            WHERE dataset_id = ? AND conversation_id IN ({placeholders})
            """,
            params,
        )
        rows = cursor.fetchall()
    except Exception as err:  # noqa: BLE001
        logger.warning("prefetch_contact_ids_for_conversations failed: %s", err)
        return resolved
    for record in rows:
        (sender,) = record
        sender_contacts = collect_contact_ids_for_sender(
            sender,
            id_mm,
            self_contact_id=self_contact_id,
            nanp_idx=nanp_idx,
        )
        resolved.update(sender_contacts)
    return resolved
|
|
596
|
+
|
|
597
|
+
|
|
598
|
+
def load_conversation_participant_contact_ids(
    conn: Any,
    dataset_id: str,
    conversation_ids: Set[str],
    id_mm: Dict[str, Set[str]],
    *,
    self_contact_id: Optional[str],
    nanp_idx: Optional[Dict[str, Set[str]]] = None,
    display_names: Dict[str, Optional[str]],
    known_usernames_by_cid: Dict[str, List[str]],
    fallback_labels: Dict[str, str],
    policies: Dict[str, Dict[str, str]],
    inherit_defaults: bool,
) -> Dict[str, Set[str]]:
    """
    Return mapping conversation_id -> set(contact_id) using sender_id values from
    conversation_messages within the same dataset.

    Each distinct ``(conversation_id, sender_id)`` pair is resolved with
    ``collect_contact_ids_for_sender`` and reduced to a single contact via
    ``pick_representative_contact_id``. Pairs whose conversation id is blank, or whose
    sender resolves to no contact, contribute nothing. An empty dict is returned when an
    argument is missing/empty or the query fails (the failure is logged as a warning).
    """
    out: Dict[str, Set[str]] = {}
    if not conn or not dataset_id or not conversation_ids:
        return out
    placeholders = ",".join("?" for _ in conversation_ids)
    params: List[Any] = [dataset_id, *sorted(conversation_ids)]
    try:
        # DISTINCT: the result is a set per conversation, so resolving the same sender
        # once per message row would only repeat identical work.
        rows = conn.execute(
            f"""
            SELECT DISTINCT conversation_id, sender_id
            FROM conversation_messages
            WHERE dataset_id = ? AND conversation_id IN ({placeholders})
            """,
            params,
        ).fetchall()
    except Exception as exc:  # noqa: BLE001
        logger.warning("load_conversation_participant_contact_ids failed: %s", exc)
        return out

    for conv_id, sender_id in rows:
        conv = str(conv_id or "").strip()
        if not conv:
            continue
        cids = collect_contact_ids_for_sender(
            sender_id,
            id_mm,
            self_contact_id=self_contact_id,
            nanp_idx=nanp_idx,
        )
        if not cids:
            continue
        cid = pick_representative_contact_id(
            cids,
            display_names=display_names,
            known_usernames_by_cid=known_usernames_by_cid,
            fallback_labels=fallback_labels,
            policies=policies,
            inherit_defaults=inherit_defaults,
        )
        if not cid:
            continue
        out.setdefault(conv, set()).add(cid)
    return out
|
|
662
|
+
|
|
663
|
+
|
|
664
|
+
def load_contact_meta(
    conn: Any,
    dataset_id: str,
    contact_ids: Set[str],
) -> Tuple[Dict[str, Dict[str, str]], Dict[str, Optional[str]], Dict[str, List[str]]]:
    """Return (sharing_policy_by_cid, display_name_by_cid, known_usernames_by_cid).

    Reads the contacts rows for ``contact_ids`` within ``dataset_id``. For every row with
    a non-blank contact id:

    * the policy is ``_parse_sharing_policy(sharing_policy_json)``;
    * the display name is the stripped ``display_name``, or ``None`` when the column is
      empty;
    * the usernames are the non-blank, stripped entries of ``known_usernames_json``
      (an empty list when the column is not a JSON array or cannot be parsed).

    Three empty dicts are returned when an argument is missing/empty or the query fails
    (the failure is logged as a warning).
    """
    policies: Dict[str, Dict[str, str]] = {}
    names: Dict[str, Optional[str]] = {}
    usernames: Dict[str, List[str]] = {}
    if not conn or not dataset_id or not contact_ids:
        return policies, names, usernames
    placeholders = ",".join("?" for _ in contact_ids)
    params: List[Any] = [dataset_id, *sorted(contact_ids)]
    try:
        rows = conn.execute(
            f"""
            SELECT contact_id, display_name, sharing_policy_json, known_usernames_json
            FROM {CONTACTS_TABLE}
            WHERE dataset_id = ? AND contact_id IN ({placeholders})
            """,
            params,
        ).fetchall()
    except Exception as exc:  # noqa: BLE001
        logger.warning("load_contact_meta failed: %s", exc)
        return policies, names, usernames
    for cid, dname, pol, raw_users in rows:
        c = str(cid or "").strip()
        if not c:
            continue
        names[c] = str(dname).strip() if dname else None
        policies[c] = _parse_sharing_policy(pol)
        parsed: List[str] = []
        # Guard only the parse, and only against what a malformed payload can raise.
        try:
            arr = json.loads(raw_users or "[]")
        except (TypeError, ValueError, RecursionError):
            arr = None
        if isinstance(arr, list):
            # Skip JSON nulls so they are not stringified into a "None" username.
            stripped = (str(v).strip() for v in arr if v is not None)
            parsed = [s for s in stripped if s]
        usernames[c] = parsed
    return policies, names, usernames
|
|
704
|
+
|
|
705
|
+
|
|
706
|
+
def load_identifier_fallback_labels(
    conn: Any,
    dataset_id: str,
    contact_ids: Set[str],
) -> Dict[str, str]:
    """
    Pick one identifier per contact to serve as a label when ``contacts.display_name`` is empty.

    This keeps ``sender_display_name`` populated for contacts whose address book row
    exists but whose card name never reached ``display_name``.

    Identifier rows are ordered so that rows with ``source_id = '*'`` come after all
    others, and newer ``updated_at`` values come first within each group; the first
    non-blank identifier seen for a contact is kept. An empty dict is returned when an
    argument is missing/empty or the query fails (the failure is logged as a warning).
    """
    labels: Dict[str, str] = {}
    if not (conn and dataset_id and contact_ids):
        return labels
    placeholders = ",".join("?" for _ in contact_ids)
    query_args: List[Any] = [dataset_id]
    query_args.extend(sorted(contact_ids))
    try:
        cursor = conn.execute(
            f"""
            SELECT contact_id, identifier, source_id
            FROM {CONTACT_IDENTIFIERS_TABLE}
            WHERE dataset_id = ? AND contact_id IN ({placeholders})
            ORDER BY CASE WHEN source_id = '*' THEN 1 ELSE 0 END ASC, updated_at DESC
            """,
            query_args,
        )
        rows = cursor.fetchall()
    except Exception as err:  # noqa: BLE001
        logger.warning("load_identifier_fallback_labels failed: %s", err)
        return labels
    for record in rows:
        contact_raw, identifier_raw, _source = record
        contact_key = str(contact_raw or "").strip()
        label = str(identifier_raw or "").strip()
        if contact_key and label:
            # Rows arrive best-first, so never overwrite an earlier pick.
            labels.setdefault(contact_key, label)
    return labels
|
|
742
|
+
|
|
743
|
+
|
|
744
|
+
def apply_message_contact_pipeline(
    items: List[Dict[str, Any]],
    *,
    conn: Any,
    dataset_id: Optional[str],
    allowed_scopes: List[str],
    manifest: Optional[FilterManifest],
    filters: Optional[Dict[str, Any]],
) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """
    Apply message_contact_participation, owner sharing_policy row exclusion, grant block/allow lists,
    and optional sender_display_name enrichment when contacts:resolve + contact_display_names.

    Returns ``(rows, sidecar)``. ``sidecar["message_owner"]`` describes the dataset owner so clients
    can label owner-authored rows (see per-row ``sender_is_owner`` and ``is_from_self``).

    Args:
        items: Message rows (dicts). The keys read here are ``source_id``, ``sender_id``,
            ``conversation_id``/``thread_id`` and ``is_from_self``.
        conn: Database connection handed to the lookup helpers.
        dataset_id: Dataset the rows belong to.
        allowed_scopes: Granted scopes; ``"contacts:resolve"`` gates name enrichment.
        manifest: Optional filter manifest; the ``message_contact_participation`` and
            ``contact_display_names`` filters are read from it.
        filters: Optional dict; its ``contact_grant_policy`` entry supplies
            ``inherit_contact_defaults``, ``blocklist_contact_ids`` and
            ``allowlist_contact_ids``.

    Surviving rows are shallow copies of the inputs with ``is_from_self`` coerced to bool
    and ``sender_is_owner`` added; ``sender_display_name`` is set only when a name could be
    resolved and is allowed to be shown. When ``items``, ``conn`` or ``dataset_id`` is
    missing, ``items`` is returned unchanged together with an empty sidecar.
    """
    if not items or not conn or not dataset_id:
        return items, {}

    scope_set = {str(s).strip() for s in (allowed_scopes or []) if s}
    can_resolve = "contacts:resolve" in scope_set

    participation = manifest.get_filter("message_contact_participation") if manifest else None
    name_filter = manifest.get_filter("contact_display_names") if manifest else None
    # If contacts:resolve is granted, enrich names by default unless the manifest explicitly disables it.
    names_enabled = can_resolve
    if name_filter is not None:
        names_enabled = bool(name_filter.params.get("enabled"))

    # Grant-level contact policy; anything that is not a dict is treated as empty.
    cgp = (filters or {}).get("contact_grant_policy") if isinstance(filters, dict) else None
    cgp = cgp if isinstance(cgp, dict) else {}
    inherit_defaults = bool(cgp.get("inherit_contact_defaults", True))
    grant_block: Set[str] = {str(x).strip() for x in (cgp.get("blocklist_contact_ids") or []) if str(x).strip()}
    grant_allow: Set[str] = {str(x).strip() for x in (cgp.get("allowlist_contact_ids") or []) if str(x).strip()}

    # Indexes handed to the collect_* helpers below to resolve senders to contact ids.
    source_ids = {str(row.get("source_id") or "").strip() for row in items if row.get("source_id")}
    id_mm = build_identifier_contact_multimap(conn, dataset_id, source_ids)
    nanp_idx = build_nanp_digit_contact_index(conn, dataset_id)
    if not id_mm:
        logger.info(
            "UMA contact enrichment: no rows in %s for dataset_id=%s; cannot resolve senders to contacts",
            CONTACT_IDENTIFIERS_TABLE,
            dataset_id[:48] if dataset_id else "",
        )

    canonical_owner_display_name = load_user_identity_display_name(conn, dataset_id)
    self_contact_id, _self_label = load_self_contact_info(conn, dataset_id)
    literal_self_cids = _contact_ids_for_literal_self_sender(id_mm)

    # First pass: gather the conversation ids and candidate contact ids seen on this page
    # so contact metadata can be loaded with one query per table.
    conversation_ids_in_page: Set[str] = set()
    contact_ids_in_page: Set[str] = set()
    for row in items:
        conv = str(row.get("conversation_id") or row.get("thread_id") or "").strip()
        if conv:
            conversation_ids_in_page.add(conv)
        contact_ids_in_page.update(
            collect_message_contact_ids(row, id_mm, self_contact_id=self_contact_id, nanp_idx=nanp_idx)
        )
    # Always include the owner: the is_self contact when known, otherwise the contacts
    # returned by _contact_ids_for_literal_self_sender.
    if self_contact_id:
        contact_ids_in_page.add(self_contact_id)
    else:
        contact_ids_in_page.update(literal_self_cids)
    # Also include contacts of every sender in the page's conversations, not just the
    # senders of the rows on this page.
    contact_ids_in_page.update(
        prefetch_contact_ids_for_conversations(
            conn,
            dataset_id,
            conversation_ids_in_page,
            id_mm,
            self_contact_id=self_contact_id,
            nanp_idx=nanp_idx,
        )
    )

    policies, display_names, known_usernames_by_cid = load_contact_meta(conn, dataset_id, contact_ids_in_page)
    fallback_labels = load_identifier_fallback_labels(conn, dataset_id, contact_ids_in_page)

    # Without an is_self contact, fall back to one representative of the literal-self contacts.
    effective_owner_contact_id: Optional[str] = self_contact_id
    if not effective_owner_contact_id and literal_self_cids:
        effective_owner_contact_id = pick_representative_contact_id(
            literal_self_cids,
            display_names=display_names,
            known_usernames_by_cid=known_usernames_by_cid,
            fallback_labels=fallback_labels,
            policies=policies,
            inherit_defaults=inherit_defaults,
        )

    participant_map = load_conversation_participant_contact_ids(
        conn,
        dataset_id,
        conversation_ids_in_page,
        id_mm,
        self_contact_id=self_contact_id,
        nanp_idx=nanp_idx,
        display_names=display_names,
        known_usernames_by_cid=known_usernames_by_cid,
        fallback_labels=fallback_labels,
        policies=policies,
        inherit_defaults=inherit_defaults,
    )

    # row_block: contacts whose rows are dropped. name_block: contacts whose names stay hidden.
    # Per-contact policies only contribute when the grant inherits contact defaults.
    row_block: Set[str] = set(grant_block)
    name_block: Set[str] = set()
    for cid, pol in policies.items():
        if not inherit_defaults:
            continue
        if pol.get("row_visibility") == "exclude_from_grants":
            row_block.add(cid)
        if pol.get("name_visibility") == "hidden":
            name_block.add(cid)

    # Manifest participation filter: mode is "all" unless the filter says otherwise, and
    # contact_ids are only read for the "blocklist" and "allowlist" modes.
    mode = "all"
    manifest_block_ids: Set[str] = set()
    manifest_allow_ids: Set[str] = set()
    if participation:
        mode = str(participation.params.get("mode") or "all")
        raw_ids = participation.params.get("contact_ids") or []
        if isinstance(raw_ids, list):
            if mode == "blocklist":
                manifest_block_ids = {str(x).strip() for x in raw_ids if str(x).strip()}
            elif mode == "allowlist":
                manifest_allow_ids = {str(x).strip() for x in raw_ids if str(x).strip()}
    # "sender_only" matches a row against its sender alone; any other value also
    # considers the conversation's participants (see the main loop).
    match_mode = "thread_participants"
    if participation:
        match_mode = str(participation.params.get("match") or "thread_participants")

    row_block |= manifest_block_ids

    # Resolve participant labels once for the whole page; only needed when names are emitted.
    graph_labels: Dict[str, Dict[str, str]] = {}
    graph_pid: Set[str] = set()
    if can_resolve and names_enabled:
        for row in items:
            sid_g = str(row.get("sender_id") or "").strip()
            if sid_g and sid_g.lower() != "self":
                graph_pid.add(sid_g)
            alt_g = _metadata_chat_identifier(row)
            if alt_g:
                aa = str(alt_g).strip()
                if aa:
                    graph_pid.add(aa)
        if self_contact_id:
            graph_pid.add(self_contact_id)
        elif literal_self_cids:
            graph_pid.update(literal_self_cids)
        if graph_pid:
            try:
                graph_labels = resolve_participant_labels(
                    conn, dataset_id=dataset_id, participant_ids=sorted(graph_pid)
                )
            except Exception as exc:  # noqa: BLE001
                # Best effort: enrichment continues with the non-graph labels.
                logger.warning("resolve_participant_labels for UMA enrichment failed: %s", exc)
                graph_labels = {}

    # Second pass: filter each row, then enrich the survivors.
    out_rows: List[Dict[str, Any]] = []
    for row in items:
        sid = str(row.get("sender_id") or "").strip()
        # cids: contacts resolved from the whole row; self_sender_cids: from sender_id alone.
        cids = collect_message_contact_ids(row, id_mm, self_contact_id=self_contact_id, nanp_idx=nanp_idx)
        self_sender_cids = collect_contact_ids_for_sender(
            row.get("sender_id"),
            id_mm,
            self_contact_id=self_contact_id,
            nanp_idx=nanp_idx,
        )
        cid = (
            pick_representative_contact_id(
                cids,
                display_names=display_names,
                known_usernames_by_cid=known_usernames_by_cid,
                fallback_labels=fallback_labels,
                policies=policies,
                inherit_defaults=inherit_defaults,
            )
            if cids
            else None
        )
        # iMessage rows often duplicate the peer handle in metadata; that merges peer + owner into
        # ``cids``. Never treat the peer as the sender when ``sender_id`` is literally ``self``.
        if sid.lower() == "self":
            if self_contact_id and self_contact_id in cids:
                cid = self_contact_id
            elif self_sender_cids:
                cid = pick_representative_contact_id(
                    self_sender_cids,
                    display_names=display_names,
                    known_usernames_by_cid=known_usernames_by_cid,
                    fallback_labels=fallback_labels,
                    policies=policies,
                    inherit_defaults=inherit_defaults,
                )
            else:
                cid = None

        # Contacts this row is matched against by the block/allow lists.
        conv = str(row.get("conversation_id") or row.get("thread_id") or "").strip()
        if match_mode == "sender_only":
            row_contact_ids: Set[str] = {cid} if cid else set()
        else:
            row_contact_ids = set(participant_map.get(conv, set()))
            if cid:
                row_contact_ids.add(cid)

        # Drop the row if any matched contact is blocked.
        if row_contact_ids and row_contact_ids.intersection(row_block):
            continue
        # Manifest allowlist: an empty allowlist drops every row.
        if mode == "allowlist":
            if not manifest_allow_ids:
                continue
            if not row_contact_ids.intersection(manifest_allow_ids):
                continue
        # Grant allowlist only applies when it is non-empty.
        if grant_allow:
            if not row_contact_ids.intersection(grant_allow):
                continue

        # Copy so the caller's row dict is not mutated.
        new_row = dict(row)
        raw_graph = ""
        graph_dn = ""
        pipeline_dn = ""
        if can_resolve and names_enabled:
            # The canonical owner name wins for literal "self" senders.
            if sid.lower() == "self" and canonical_owner_display_name:
                new_row["sender_display_name"] = canonical_owner_display_name
            elif cid:
                raw_graph = _uma_graph_display_name_for_row(graph_labels, row) if graph_labels else ""
                graph_dn = _graph_display_name_respects_name_policy(
                    raw_graph, cids, display_names, name_block
                )
                pipeline_dn = visible_label_for_contact(
                    cid,
                    display_names=display_names,
                    known_usernames_by_cid=known_usernames_by_cid,
                    fallback_labels=fallback_labels,
                )
                # No name is emitted for contacts hidden by policy or blocked by the grant.
                if cid not in name_block and cid not in grant_block:
                    dn = ""
                    # Set when this "self" row resolved to the owner's own contact.
                    owner_graph_key: Optional[str] = None
                    if sid.lower() == "self" and cid:
                        if self_contact_id is not None and cid == self_contact_id:
                            owner_graph_key = self_contact_id
                        elif self_contact_id is None and cid in self_sender_cids:
                            owner_graph_key = cid
                    if owner_graph_key is not None:
                        # Prefer graph resolution (identifier promotion across duplicate cards) over
                        # pipeline fallback labels, which are often raw phone strings.
                        dn = (_self_label or "").strip()
                        if not dn and graph_labels:
                            dn = _owner_graph_display_name(graph_labels, owner_graph_key)
                        if not dn:
                            dn = pipeline_dn
                    elif graph_dn:
                        dn = graph_dn
                    else:
                        dn = pipeline_dn
                    if dn:
                        new_row["sender_display_name"] = dn
        new_row["is_from_self"] = bool(new_row.get("is_from_self"))
        sid_lower = sid.strip().lower()
        # Owner-authored when the sender is the literal "self", the resolved contact is the
        # effective owner contact, or the row was already flagged is_from_self.
        new_row["sender_is_owner"] = bool(
            sid_lower == "self"
            or (effective_owner_contact_id is not None and cid == effective_owner_contact_id)
            or new_row.get("is_from_self")
        )

        out_rows.append(new_row)

    owner_messages_in_response = sum(1 for r in out_rows if r.get("sender_is_owner"))
    # Owner display name for the sidecar. Candidates are tried in order: canonical identity
    # name, is_self contact label, graph label, visible contact label. A candidate flagged
    # by _is_imessage_self_sentinel_label is not exposed.
    owner_display_for_response: Optional[str] = None
    if can_resolve and names_enabled:
        _ocand = (canonical_owner_display_name or "").strip()
        if not _ocand and effective_owner_contact_id:
            _ocand = (_self_label or "").strip()
            if not _ocand and graph_labels:
                _ocand = _owner_graph_display_name(graph_labels, effective_owner_contact_id)
            if not _ocand:
                _ocand = visible_label_for_contact(
                    effective_owner_contact_id,
                    display_names=display_names,
                    known_usernames_by_cid=known_usernames_by_cid,
                    fallback_labels=fallback_labels,
                )
        if _ocand and not _is_imessage_self_sentinel_label(_ocand):
            owner_display_for_response = _ocand
    # owner_user_id is the part of dataset_id before the first ":".
    owner_uid = (dataset_id or "").split(":", 1)[0] or None
    sidecar: Dict[str, Any] = {
        "message_owner": {
            "owner_user_id": owner_uid,
            "owner_contact_id": effective_owner_contact_id,
            "owner_display_name": owner_display_for_response,
            "owner_messages_in_this_response": owner_messages_in_response,
        }
    }

    return out_rows, sidecar
|