brainlayer 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- brainlayer/__init__.py +3 -0
- brainlayer/cli/__init__.py +1545 -0
- brainlayer/cli/wizard.py +132 -0
- brainlayer/cli_new.py +151 -0
- brainlayer/client.py +164 -0
- brainlayer/clustering.py +736 -0
- brainlayer/daemon.py +1105 -0
- brainlayer/dashboard/README.md +129 -0
- brainlayer/dashboard/__init__.py +5 -0
- brainlayer/dashboard/app.py +151 -0
- brainlayer/dashboard/search.py +229 -0
- brainlayer/dashboard/views.py +230 -0
- brainlayer/embeddings.py +131 -0
- brainlayer/engine.py +550 -0
- brainlayer/index_new.py +87 -0
- brainlayer/mcp/__init__.py +1558 -0
- brainlayer/migrate.py +205 -0
- brainlayer/paths.py +43 -0
- brainlayer/pipeline/__init__.py +47 -0
- brainlayer/pipeline/analyze_communication.py +508 -0
- brainlayer/pipeline/brain_graph.py +567 -0
- brainlayer/pipeline/chat_tags.py +63 -0
- brainlayer/pipeline/chunk.py +422 -0
- brainlayer/pipeline/classify.py +472 -0
- brainlayer/pipeline/cluster_sampling.py +73 -0
- brainlayer/pipeline/enrichment.py +810 -0
- brainlayer/pipeline/extract.py +66 -0
- brainlayer/pipeline/extract_claude_desktop.py +149 -0
- brainlayer/pipeline/extract_corrections.py +231 -0
- brainlayer/pipeline/extract_markdown.py +195 -0
- brainlayer/pipeline/extract_whatsapp.py +227 -0
- brainlayer/pipeline/git_overlay.py +301 -0
- brainlayer/pipeline/longitudinal_analyzer.py +568 -0
- brainlayer/pipeline/obsidian_export.py +455 -0
- brainlayer/pipeline/operation_grouping.py +486 -0
- brainlayer/pipeline/plan_linking.py +313 -0
- brainlayer/pipeline/sanitize.py +549 -0
- brainlayer/pipeline/semantic_style.py +574 -0
- brainlayer/pipeline/session_enrichment.py +472 -0
- brainlayer/pipeline/style_embed.py +67 -0
- brainlayer/pipeline/style_index.py +139 -0
- brainlayer/pipeline/temporal_chains.py +203 -0
- brainlayer/pipeline/time_batcher.py +248 -0
- brainlayer/pipeline/unified_timeline.py +569 -0
- brainlayer/storage.py +66 -0
- brainlayer/store.py +155 -0
- brainlayer/taxonomy.json +80 -0
- brainlayer/vector_store.py +1891 -0
- brainlayer-1.0.0.dist-info/METADATA +313 -0
- brainlayer-1.0.0.dist-info/RECORD +53 -0
- brainlayer-1.0.0.dist-info/WHEEL +4 -0
- brainlayer-1.0.0.dist-info/entry_points.txt +4 -0
- brainlayer-1.0.0.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,574 @@
|
|
|
1
|
+
"""Semantic Style Analysis - Topic-based style pattern extraction.
|
|
2
|
+
|
|
3
|
+
Uses bge-large embeddings to cluster messages by TOPIC (what you write about),
|
|
4
|
+
then analyzes STYLE within each topic cluster (how you write in that context).
|
|
5
|
+
|
|
6
|
+
This differs from style_embed.py which uses StyleDistance for pure style clustering.
|
|
7
|
+
Here we want: "When talking about technical topics, you write like THIS"
|
|
8
|
+
|
|
9
|
+
Key concepts:
|
|
10
|
+
- Topic clusters: technical, casual chat, professional, emotional
|
|
11
|
+
- Per-topic style metrics: formality, length, emoji, phrases
|
|
12
|
+
- Cross-context comparisons: "more formal when discussing work"
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import json
|
|
18
|
+
import logging
|
|
19
|
+
import re
|
|
20
|
+
from collections import Counter
|
|
21
|
+
from dataclasses import dataclass, field
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Any, Optional
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
try:
|
|
28
|
+
import numpy as np
|
|
29
|
+
|
|
30
|
+
HAS_NUMPY = True
|
|
31
|
+
except ImportError:
|
|
32
|
+
HAS_NUMPY = False
|
|
33
|
+
|
|
34
|
+
try:
|
|
35
|
+
from sentence_transformers import SentenceTransformer
|
|
36
|
+
|
|
37
|
+
HAS_SENTENCE_TRANSFORMERS = True
|
|
38
|
+
except ImportError:
|
|
39
|
+
HAS_SENTENCE_TRANSFORMERS = False
|
|
40
|
+
|
|
41
|
+
try:
|
|
42
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
43
|
+
|
|
44
|
+
HAS_SKLEARN = True
|
|
45
|
+
except ImportError:
|
|
46
|
+
HAS_SKLEARN = False
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# Sentence-embedding checkpoint used for semantic/topic clustering.
# NOTE(review): this was originally described as "1024 dims, good multilingual",
# but the name selects the "-en" (English) bge-large checkpoint while TOPIC_SEEDS
# below contains Hebrew phrases — confirm Hebrew quality or pick a multilingual
# checkpoint.
SEMANTIC_MODEL = "BAAI/bge-large-en-v1.5"
# embed_messages() cuts every message to this many characters before encoding.
MAX_CHARS = 2000

# Predefined topic seeds for guided clustering (English + Hebrew).
# Keys are topic names; the phrases under each key are embedded and averaged
# into one centroid per topic, and messages are matched against those centroids.
TOPIC_SEEDS = {
    "technical": [
        "debugging the code",
        "implementing the feature",
        "API endpoint",
        "database query",
        "git commit",
        "pull request review",
        "לתקן את הבאג",
        "לממש את הפיצ'ר",
        "לעשות דיפלוי",
    ],
    "casual": [
        "haha that's funny",
        "what are you doing",
        "see you later",
        "good morning",
        "how was your day",
        "מה קורה",
        "מה נשמע",
        "חחח",
        "יאללה ביי",
        "בוקר טוב",
    ],
    "professional": [
        "meeting scheduled",
        "project deadline",
        "quarterly review",
        "client presentation",
        "follow up on the proposal",
        "פגישה נקבעה",
        "דדליין של הפרויקט",
        "לעקוב אחרי",
        "לסגור את הנושא",
    ],
    "emotional": [
        "I'm so excited",
        "that's frustrating",
        "really happy about",
        "worried about",
        "love this",
        "אני כל כך שמח",
        "זה מתסכל",
        "אוהב את זה",
        "דואג לגבי",
    ],
    "explanatory": [
        "let me explain",
        "the reason is",
        "basically what happens is",
        "think of it like",
        "for example",
        "תן לי להסביר",
        "הסיבה היא",
        "בעצם מה שקורה",
        "לדוגמה",
    ],
}
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@dataclass
class TopicCluster:
    """A cluster of messages grouped by topic, plus the style metrics for it.

    Instances are built by ``SemanticStyleAnalyzer.analyze`` from the dict
    returned by ``analyze_cluster_style``; the metric fields mirror that
    dict's keys.
    """

    # Topic label (a TOPIC_SEEDS key, or "other" for unmatched messages).
    name: str
    # Sample of the member messages; analyze() stores at most the first 100.
    messages: list[str] = field(default_factory=list)
    # Optional centroid vector; analyze() in this module does not set it.
    centroid: Optional[list[float]] = None

    # Style metrics for this topic
    # Number of messages the metrics were computed over (not the sample size).
    message_count: int = 0
    # Mean message length in characters.
    avg_length: float = 0.0
    # Formality score; analyze_cluster_style keeps it within 0.1..0.9.
    formality: float = 0.5
    # Emoji matches per message.
    emoji_rate: float = 0.0
    # "?" characters per message.
    question_rate: float = 0.0
    # "!" characters per message.
    exclamation_rate: float = 0.0
    # Frequent bigrams/trigrams (at most 10).
    common_phrases: list[str] = field(default_factory=list)
    # Fraction of messages containing each script, keyed "hebrew" / "english".
    language_mix: dict[str, float] = field(default_factory=dict)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
@dataclass
class SemanticStyleAnalysis:
    """Complete semantic style analysis result.

    Returned by ``SemanticStyleAnalyzer.analyze`` and consumed by
    ``SemanticStyleAnalyzer.save_analysis``.
    """

    # Per-topic clusters, keyed by topic name; only topics that met the
    # minimum cluster size are present.
    topic_clusters: dict[str, TopicCluster] = field(default_factory=dict)
    # Human-readable sentences comparing the clusters with each other.
    cross_topic_insights: list[str] = field(default_factory=list)
    # Rendered markdown report (written to semantic-style-rules.md on save).
    style_rules_markdown: str = ""
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
class SemanticStyleAnalyzer:
|
|
143
|
+
"""Analyze writing style patterns by topic/context."""
|
|
144
|
+
|
|
145
|
+
def __init__(self, model_name: str = SEMANTIC_MODEL):
    """Check optional dependencies and set up lazy state.

    Args:
        model_name: Name of the sentence-transformers model to load on
            first use of ``self.model``.

    Raises:
        ImportError: If numpy, sentence-transformers or scikit-learn could
            not be imported when this module was loaded (checked in that
            order).
    """
    dependency_checks = (
        (HAS_NUMPY, "numpy required. Install: pip install numpy"),
        (
            HAS_SENTENCE_TRANSFORMERS,
            "sentence-transformers required. Install: pip install sentence-transformers",
        ),
        (HAS_SKLEARN, "scikit-learn required. Install: pip install scikit-learn"),
    )
    for is_available, install_hint in dependency_checks:
        if not is_available:
            raise ImportError(install_hint)

    # Both caches start empty and are filled on first use.
    self._model: Optional[SentenceTransformer] = None
    self._topic_seed_embeddings: Optional[dict[str, np.ndarray]] = None
    self.model_name = model_name
|
|
155
|
+
|
|
156
|
+
@property
def model(self) -> SentenceTransformer:
    """Return the embedding model, constructing it on first access."""
    loaded = self._model
    if loaded is not None:
        return loaded

    # First access: announce the load, build the model, and remember it.
    logger.info("Loading %s...", self.model_name)
    loaded = SentenceTransformer(self.model_name)
    self._model = loaded
    return loaded
|
|
163
|
+
|
|
164
|
+
def _get_topic_seed_embeddings(self) -> dict[str, np.ndarray]:
    """Get/compute the centroid embedding of every topic's seed phrases.

    On the first call each TOPIC_SEEDS phrase list is encoded with
    ``self.model`` and averaged into one vector per topic; the resulting
    dict is cached on the instance and returned as-is afterwards.

    Returns:
        Dict mapping topic name to its mean seed embedding.
    """
    if self._topic_seed_embeddings is None:
        # Build into a local dict and publish it only once every topic has
        # been encoded. Publishing an empty dict first would leave a partial
        # cache behind if encode() raised midway, and later calls would
        # silently treat that partial result as complete.
        centroids: dict[str, np.ndarray] = {}
        for topic, seeds in TOPIC_SEEDS.items():
            seed_vectors = self.model.encode(seeds, convert_to_numpy=True)
            # Average the seed embeddings to get the topic centroid
            centroids[topic] = np.mean(seed_vectors, axis=0)
        self._topic_seed_embeddings = centroids
    return self._topic_seed_embeddings
|
|
173
|
+
|
|
174
|
+
def embed_messages(
    self,
    messages: list[str],
    batch_size: int = 32,
    show_progress: bool = True,
) -> np.ndarray:
    """Encode messages with the embedding model for topic clustering.

    Args:
        messages: Message texts; each is cut to MAX_CHARS characters first.
        batch_size: Batch size forwarded to the model's ``encode``.
        show_progress: Forwarded as ``show_progress_bar``.

    Returns:
        Whatever ``self.model.encode`` returns with ``convert_to_numpy=True``.
    """
    encode_options = {
        "batch_size": batch_size,
        "show_progress_bar": show_progress,
        "convert_to_numpy": True,
    }
    # Clip overly long messages before handing them to the model.
    clipped_texts = [text[:MAX_CHARS] for text in messages]
    return self.model.encode(clipped_texts, **encode_options)
|
|
191
|
+
|
|
192
|
+
def assign_topics(
    self,
    messages: list[str],
    embeddings: np.ndarray,
    threshold: float = 0.3,
) -> dict[str, list[int]]:
    """Assign messages to topics based on similarity to seed centroids.

    The cosine similarity between every message embedding and every topic
    centroid is computed in one matrix product instead of one library call
    per (message, topic) pair.

    Args:
        messages: List of message texts. Not read here; kept so the
            signature stays compatible with existing callers.
        embeddings: Message embeddings from embed_messages(), one row per
            message.
        threshold: Minimum cosine similarity (0-1) to assign to a topic.
            A message is assigned only if its best similarity is strictly
            greater than this; otherwise it goes to "other".

    Returns:
        Dict mapping topic name (plus "other") to list of message indices.
        When several topics tie for the best similarity, the topic that
        comes first in the seed dict wins.
    """
    topic_seeds = self._get_topic_seed_embeddings()
    assignments: dict[str, list[int]] = {topic: [] for topic in topic_seeds}
    assignments["other"] = []

    message_matrix = np.asarray(embeddings, dtype=float)
    if message_matrix.size == 0:
        # Nothing to assign (e.g. an empty message list).
        return assignments

    topic_names = list(topic_seeds)
    if not topic_names:
        # No topics to compare against: everything is "other".
        assignments["other"].extend(range(len(message_matrix)))
        return assignments

    seed_matrix = np.asarray([topic_seeds[name] for name in topic_names], dtype=float)

    def _unit_rows(matrix: np.ndarray) -> np.ndarray:
        """Scale each row to unit length; all-zero rows are left as zeros."""
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0.0] = 1.0
        return matrix / norms

    # similarities[i, j] = cosine similarity of message i and topic j.
    similarities = _unit_rows(message_matrix) @ _unit_rows(seed_matrix).T
    # argmax returns the first maximum, which keeps the "first topic wins
    # on ties" behaviour of a sequential strictly-greater scan.
    best_columns = similarities.argmax(axis=1)
    best_scores = similarities[np.arange(len(best_columns)), best_columns]

    for index, (column, score) in enumerate(zip(best_columns, best_scores)):
        label = topic_names[column] if score > threshold else "other"
        assignments[label].append(index)

    return assignments
|
|
226
|
+
|
|
227
|
+
def analyze_cluster_style(self, messages: list[str]) -> dict[str, Any]:
    """Analyze style patterns within a cluster of messages.

    Args:
        messages: Message texts belonging to one cluster.

    Returns:
        An empty dict when ``messages`` is empty; otherwise a dict with:
        ``avg_length`` (mean characters per message), ``formality``
        (0.5 shifted by informal/formal marker ratios, kept within
        0.1..0.9), ``emoji_rate`` (emoji matches per message; a run of
        consecutive emoji counts once), ``question_rate`` and
        ``exclamation_rate`` ("?" / "!" characters per message),
        ``language_mix`` (fraction of messages containing Hebrew / Latin
        letters), ``common_phrases`` (up to 10 bigrams/trigrams seen at
        least 3 times) and ``message_count``.
    """
    if not messages:
        return {}

    total = len(messages)
    lowered = [msg.lower() for msg in messages]

    # Length analysis
    avg_length = sum(len(msg) for msg in messages) / total

    # Formality indicators
    informal_markers = [
        r"\blol\b",
        r"\bhaha\b",
        r"\bחח\b",
        r"\bomg\b",
        r"\bbtw\b",
        r"\bכן\b",
        r"\bלא\b",
        r"\bוואלה\b",
        r"\bסבבה\b",
        r"\bיאללה\b",
        r"!!+",
        r"\?\?+",
        r"\.\.\.+",
    ]
    formal_markers = [
        r"\bplease\b",
        r"\bkindly\b",
        r"\bregards\b",
        r"\bthank you\b",
        r"\bבבקשה\b",
        r"\bתודה\b",
        r"\bלהלן\b",
    ]
    # Compile once per call rather than looking patterns up per message.
    informal_patterns = [re.compile(marker) for marker in informal_markers]
    formal_patterns = [re.compile(marker) for marker in formal_markers]

    # Each marker counts at most once per message.
    informal_count = sum(1 for text in lowered for pattern in informal_patterns if pattern.search(text))
    formal_count = sum(1 for text in lowered for pattern in formal_patterns if pattern.search(text))

    informal_ratio = min(1.0, informal_count / total)
    formal_ratio = min(1.0, formal_count / total)
    formality = 0.5 - (informal_ratio * 0.3) + (formal_ratio * 0.3)
    formality = max(0.1, min(0.9, formality))

    # Emoji rate
    emoji_pattern = re.compile(
        "["
        "\U0001f600-\U0001f64f"  # emoticons
        "\U0001f300-\U0001f5ff"  # symbols & pictographs
        "\U0001f680-\U0001f6ff"  # transport & map symbols
        "\U0001f1e0-\U0001f1ff"  # flags
        "]+",
        flags=re.UNICODE,
    )
    emoji_count = sum(len(emoji_pattern.findall(msg)) for msg in messages)
    emoji_rate = emoji_count / total

    # Punctuation rates
    question_rate = sum(msg.count("?") for msg in messages) / total
    exclamation_rate = sum(msg.count("!") for msg in messages) / total

    # Language detection (simple Hebrew/English check); a message containing
    # both scripts counts toward both fractions.
    hebrew_pattern = re.compile(r"[\u0590-\u05FF]")
    english_pattern = re.compile(r"[a-zA-Z]")
    hebrew_count = sum(1 for msg in messages if hebrew_pattern.search(msg))
    english_count = sum(1 for msg in messages if english_pattern.search(msg))

    language_mix = {
        "hebrew": hebrew_count / total,
        "english": english_count / total,
    }

    # Common phrases (bigrams and trigrams). N-grams are built per message so
    # that the last word of one message is never paired with the first word
    # of the next one, which would invent phrases nobody actually wrote.
    word_pattern = re.compile(r"\b\w+\b")
    bigrams: list[str] = []
    trigrams: list[str] = []
    for text in lowered:
        words = word_pattern.findall(text)
        bigrams.extend(" ".join(pair) for pair in zip(words, words[1:]))
        trigrams.extend(" ".join(triple) for triple in zip(words, words[1:], words[2:]))

    phrase_counts = Counter(bigrams + trigrams)
    # Keep only phrases that recur (3+ occurrences) among the 20 most common.
    min_phrase_count = 3
    common_phrases = [
        phrase for phrase, count in phrase_counts.most_common(20) if count >= min_phrase_count
    ][:10]

    return {
        "avg_length": avg_length,
        "formality": formality,
        "emoji_rate": emoji_rate,
        "question_rate": question_rate,
        "exclamation_rate": exclamation_rate,
        "language_mix": language_mix,
        "common_phrases": common_phrases,
        "message_count": total,
    }
|
|
340
|
+
|
|
341
|
+
def analyze(
    self,
    messages: list[str],
    min_cluster_size: int = 10,
) -> SemanticStyleAnalysis:
    """Run the full pipeline: embed, assign topics, measure style, report.

    Args:
        messages: List of message texts to analyze
        min_cluster_size: Topics with fewer assigned messages than this
            are left out of the result

    Returns:
        SemanticStyleAnalysis holding the per-topic clusters, the
        cross-topic insights and the rendered markdown report
    """
    logger.info("Analyzing %d messages...", len(messages))

    # Step 1: one embedding per message
    logger.info("Computing embeddings...")
    vectors = self.embed_messages(messages)

    # Step 2: bucket message indices by topic
    logger.info("Assigning topics...")
    indices_by_topic = self.assign_topics(messages, vectors)

    # Step 3: style metrics for every sufficiently large bucket
    logger.info("Analyzing topic clusters...")
    clusters_by_topic: dict[str, TopicCluster] = {}

    for topic_name in indices_by_topic:
        member_indices = indices_by_topic[topic_name]
        if len(member_indices) >= min_cluster_size:
            members = [messages[position] for position in member_indices]
            metrics = self.analyze_cluster_style(members)

            built = TopicCluster(
                name=topic_name,
                # Only a sample of the texts is kept for reference
                messages=members[:100],
                message_count=metrics.get("message_count", len(members)),
                avg_length=metrics.get("avg_length", 0),
                formality=metrics.get("formality", 0.5),
                emoji_rate=metrics.get("emoji_rate", 0),
                question_rate=metrics.get("question_rate", 0),
                exclamation_rate=metrics.get("exclamation_rate", 0),
                common_phrases=metrics.get("common_phrases", []),
                language_mix=metrics.get("language_mix", {}),
            )
            clusters_by_topic[topic_name] = built
            logger.info(" %s: %d messages, formality=%.2f", topic_name, len(member_indices), built.formality)

    # Step 4: compare clusters, then render the markdown rules
    cross_topic = self._generate_insights(clusters_by_topic)
    report = self._generate_markdown(clusters_by_topic, cross_topic)

    return SemanticStyleAnalysis(
        topic_clusters=clusters_by_topic,
        cross_topic_insights=cross_topic,
        style_rules_markdown=report,
    )
|
|
402
|
+
|
|
403
|
+
def _generate_insights(self, clusters: dict[str, TopicCluster]) -> list[str]:
|
|
404
|
+
"""Generate cross-topic insights by comparing clusters."""
|
|
405
|
+
insights = []
|
|
406
|
+
|
|
407
|
+
if len(clusters) < 2:
|
|
408
|
+
return insights
|
|
409
|
+
|
|
410
|
+
# Find most/least formal topic
|
|
411
|
+
formalities = [(t, c.formality) for t, c in clusters.items()]
|
|
412
|
+
formalities.sort(key=lambda x: x[1])
|
|
413
|
+
|
|
414
|
+
if len(formalities) >= 2:
|
|
415
|
+
most_casual = formalities[0]
|
|
416
|
+
most_formal = formalities[-1]
|
|
417
|
+
if most_formal[1] - most_casual[1] > 0.1:
|
|
418
|
+
insights.append(
|
|
419
|
+
f"Most formal in '{most_formal[0]}' contexts ({most_formal[1]:.2f}), "
|
|
420
|
+
f"most casual in '{most_casual[0]}' ({most_casual[1]:.2f})"
|
|
421
|
+
)
|
|
422
|
+
|
|
423
|
+
# Find where you use most emoji
|
|
424
|
+
emoji_rates = [(t, c.emoji_rate) for t, c in clusters.items()]
|
|
425
|
+
emoji_rates.sort(key=lambda x: x[1], reverse=True)
|
|
426
|
+
if emoji_rates[0][1] > 0.1:
|
|
427
|
+
insights.append(f"Uses most emoji in '{emoji_rates[0][0]}' contexts ({emoji_rates[0][1]:.2f} per message)")
|
|
428
|
+
|
|
429
|
+
# Language switching patterns
|
|
430
|
+
for topic, cluster in clusters.items():
|
|
431
|
+
hebrew = cluster.language_mix.get("hebrew", 0)
|
|
432
|
+
english = cluster.language_mix.get("english", 0)
|
|
433
|
+
if hebrew > 0.5 and english > 0.5:
|
|
434
|
+
insights.append(f"Frequently code-switches Hebrew/English in '{topic}' context")
|
|
435
|
+
elif hebrew > 0.8:
|
|
436
|
+
insights.append(f"Primarily Hebrew in '{topic}' context")
|
|
437
|
+
|
|
438
|
+
# Message length patterns
|
|
439
|
+
lengths = [(t, c.avg_length) for t, c in clusters.items()]
|
|
440
|
+
lengths.sort(key=lambda x: x[1])
|
|
441
|
+
if lengths[-1][1] > lengths[0][1] * 2:
|
|
442
|
+
insights.append(
|
|
443
|
+
f"Writes longest messages in '{lengths[-1][0]}' ({lengths[-1][1]:.0f} chars avg), "
|
|
444
|
+
f"shortest in '{lengths[0][0]}' ({lengths[0][1]:.0f} chars)"
|
|
445
|
+
)
|
|
446
|
+
|
|
447
|
+
return insights
|
|
448
|
+
|
|
449
|
+
def _generate_markdown(
|
|
450
|
+
self,
|
|
451
|
+
clusters: dict[str, TopicCluster],
|
|
452
|
+
insights: list[str],
|
|
453
|
+
) -> str:
|
|
454
|
+
"""Generate markdown style rules from analysis."""
|
|
455
|
+
lines = [
|
|
456
|
+
"# Your Writing Style (Semantic Analysis)",
|
|
457
|
+
"",
|
|
458
|
+
"Generated from message clustering by topic.",
|
|
459
|
+
"",
|
|
460
|
+
"## Cross-Context Insights",
|
|
461
|
+
"",
|
|
462
|
+
]
|
|
463
|
+
|
|
464
|
+
for insight in insights:
|
|
465
|
+
lines.append(f"- {insight}")
|
|
466
|
+
|
|
467
|
+
lines.extend(["", "## By Context", ""])
|
|
468
|
+
|
|
469
|
+
for topic, cluster in clusters.items():
|
|
470
|
+
lines.append(f"### {topic.title()}")
|
|
471
|
+
lines.append("")
|
|
472
|
+
lines.append(f"- **Message count:** {cluster.message_count}")
|
|
473
|
+
lines.append(f"- **Average length:** {cluster.avg_length:.0f} characters")
|
|
474
|
+
lines.append(f"- **Formality:** {cluster.formality:.2f} (0=casual, 1=formal)")
|
|
475
|
+
lines.append(f"- **Emoji rate:** {cluster.emoji_rate:.2f} per message")
|
|
476
|
+
|
|
477
|
+
if cluster.language_mix:
|
|
478
|
+
lang_str = ", ".join(f"{lang}: {pct:.0%}" for lang, pct in cluster.language_mix.items())
|
|
479
|
+
lines.append(f"- **Language mix:** {lang_str}")
|
|
480
|
+
|
|
481
|
+
if cluster.common_phrases:
|
|
482
|
+
phrases = ", ".join(f'"{p}"' for p in cluster.common_phrases[:5])
|
|
483
|
+
lines.append(f"- **Common phrases:** {phrases}")
|
|
484
|
+
|
|
485
|
+
lines.append("")
|
|
486
|
+
|
|
487
|
+
lines.extend(
|
|
488
|
+
[
|
|
489
|
+
"## For Cover Letters & Professional Outreach",
|
|
490
|
+
"",
|
|
491
|
+
"Based on your patterns:",
|
|
492
|
+
"",
|
|
493
|
+
]
|
|
494
|
+
)
|
|
495
|
+
|
|
496
|
+
# Add recommendations based on analysis
|
|
497
|
+
if "professional" in clusters:
|
|
498
|
+
prof = clusters["professional"]
|
|
499
|
+
lines.append(f"- Use formality level ~{prof.formality:.2f} (matches your work context)")
|
|
500
|
+
|
|
501
|
+
if "technical" in clusters:
|
|
502
|
+
tech = clusters["technical"]
|
|
503
|
+
lines.append(f"- Technical explanations avg {tech.avg_length:.0f} chars - keep similar length")
|
|
504
|
+
if tech.common_phrases:
|
|
505
|
+
lines.append(f"- You naturally use phrases like: {', '.join(tech.common_phrases[:3])}")
|
|
506
|
+
|
|
507
|
+
if "emotional" in clusters:
|
|
508
|
+
emo = clusters["emotional"]
|
|
509
|
+
if emo.exclamation_rate > 0.3:
|
|
510
|
+
lines.append("- You express enthusiasm with exclamation marks - use sparingly in professional context")
|
|
511
|
+
|
|
512
|
+
lines.append("- Avoid Hebrew in English-only professional contexts")
|
|
513
|
+
lines.append("")
|
|
514
|
+
|
|
515
|
+
return "\n".join(lines)
|
|
516
|
+
|
|
517
|
+
def save_analysis(
    self,
    analysis: SemanticStyleAnalysis,
    output_dir: Path,
) -> None:
    """Save analysis results to files.

    Writes two files into ``output_dir`` (created if missing, including
    parents):

    - ``semantic-style-rules.md``: the rendered markdown report
    - ``semantic-style-data.json``: per-topic metrics and the insights

    Both files are written as UTF-8 explicitly. The report and the JSON
    (dumped with ``ensure_ascii=False``) can contain Hebrew text and emoji,
    which the platform's default encoding is not guaranteed to represent.

    Args:
        analysis: Result object to persist.
        output_dir: Target directory; a plain string path is accepted too.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save markdown rules
    rules_path = output_dir / "semantic-style-rules.md"
    rules_path.write_text(analysis.style_rules_markdown, encoding="utf-8")
    logger.info("Saved rules to %s", rules_path)

    # Save JSON data for programmatic use (metrics only, no message texts)
    data = {
        "topics": {
            name: {
                "message_count": cluster.message_count,
                "avg_length": cluster.avg_length,
                "formality": cluster.formality,
                "emoji_rate": cluster.emoji_rate,
                "question_rate": cluster.question_rate,
                "exclamation_rate": cluster.exclamation_rate,
                "language_mix": cluster.language_mix,
                "common_phrases": cluster.common_phrases,
            }
            for name, cluster in analysis.topic_clusters.items()
        },
        "insights": analysis.cross_topic_insights,
    }

    json_path = output_dir / "semantic-style-data.json"
    json_path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
    logger.info("Saved data to %s", json_path)
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
def analyze_semantic_style(
    messages: list[str],
    output_dir: Optional[Path] = None,
    min_cluster_size: int = 10,
) -> SemanticStyleAnalysis:
    """One-call wrapper: build an analyzer, analyze, optionally save.

    Args:
        messages: List of message texts
        output_dir: Directory to write the result files to; nothing is
            written when this is falsy (the default ``None``)
        min_cluster_size: Minimum messages per topic cluster

    Returns:
        SemanticStyleAnalysis result
    """
    style_analyzer = SemanticStyleAnalyzer()
    result = style_analyzer.analyze(messages, min_cluster_size=min_cluster_size)

    if not output_dir:
        return result

    style_analyzer.save_analysis(result, output_dir)
    return result
|