brainlayer 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. brainlayer/__init__.py +3 -0
  2. brainlayer/cli/__init__.py +1545 -0
  3. brainlayer/cli/wizard.py +132 -0
  4. brainlayer/cli_new.py +151 -0
  5. brainlayer/client.py +164 -0
  6. brainlayer/clustering.py +736 -0
  7. brainlayer/daemon.py +1105 -0
  8. brainlayer/dashboard/README.md +129 -0
  9. brainlayer/dashboard/__init__.py +5 -0
  10. brainlayer/dashboard/app.py +151 -0
  11. brainlayer/dashboard/search.py +229 -0
  12. brainlayer/dashboard/views.py +230 -0
  13. brainlayer/embeddings.py +131 -0
  14. brainlayer/engine.py +550 -0
  15. brainlayer/index_new.py +87 -0
  16. brainlayer/mcp/__init__.py +1558 -0
  17. brainlayer/migrate.py +205 -0
  18. brainlayer/paths.py +43 -0
  19. brainlayer/pipeline/__init__.py +47 -0
  20. brainlayer/pipeline/analyze_communication.py +508 -0
  21. brainlayer/pipeline/brain_graph.py +567 -0
  22. brainlayer/pipeline/chat_tags.py +63 -0
  23. brainlayer/pipeline/chunk.py +422 -0
  24. brainlayer/pipeline/classify.py +472 -0
  25. brainlayer/pipeline/cluster_sampling.py +73 -0
  26. brainlayer/pipeline/enrichment.py +810 -0
  27. brainlayer/pipeline/extract.py +66 -0
  28. brainlayer/pipeline/extract_claude_desktop.py +149 -0
  29. brainlayer/pipeline/extract_corrections.py +231 -0
  30. brainlayer/pipeline/extract_markdown.py +195 -0
  31. brainlayer/pipeline/extract_whatsapp.py +227 -0
  32. brainlayer/pipeline/git_overlay.py +301 -0
  33. brainlayer/pipeline/longitudinal_analyzer.py +568 -0
  34. brainlayer/pipeline/obsidian_export.py +455 -0
  35. brainlayer/pipeline/operation_grouping.py +486 -0
  36. brainlayer/pipeline/plan_linking.py +313 -0
  37. brainlayer/pipeline/sanitize.py +549 -0
  38. brainlayer/pipeline/semantic_style.py +574 -0
  39. brainlayer/pipeline/session_enrichment.py +472 -0
  40. brainlayer/pipeline/style_embed.py +67 -0
  41. brainlayer/pipeline/style_index.py +139 -0
  42. brainlayer/pipeline/temporal_chains.py +203 -0
  43. brainlayer/pipeline/time_batcher.py +248 -0
  44. brainlayer/pipeline/unified_timeline.py +569 -0
  45. brainlayer/storage.py +66 -0
  46. brainlayer/store.py +155 -0
  47. brainlayer/taxonomy.json +80 -0
  48. brainlayer/vector_store.py +1891 -0
  49. brainlayer-1.0.0.dist-info/METADATA +313 -0
  50. brainlayer-1.0.0.dist-info/RECORD +53 -0
  51. brainlayer-1.0.0.dist-info/WHEEL +4 -0
  52. brainlayer-1.0.0.dist-info/entry_points.txt +4 -0
  53. brainlayer-1.0.0.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,574 @@
1
+ """Semantic Style Analysis - Topic-based style pattern extraction.
2
+
3
+ Uses bge-large embeddings to cluster messages by TOPIC (what you write about),
4
+ then analyzes STYLE within each topic cluster (how you write in that context).
5
+
6
+ This differs from style_embed.py which uses StyleDistance for pure style clustering.
7
+ Here we want: "When talking about technical topics, you write like THIS"
8
+
9
+ Key concepts:
10
+ - Topic clusters: technical, casual chat, professional, emotional
11
+ - Per-topic style metrics: formality, length, emoji, phrases
12
+ - Cross-context comparisons: "more formal when discussing work"
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ import logging
19
+ import re
20
+ from collections import Counter
21
+ from dataclasses import dataclass, field
22
+ from pathlib import Path
23
+ from typing import Any, Optional
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+ try:
28
+ import numpy as np
29
+
30
+ HAS_NUMPY = True
31
+ except ImportError:
32
+ HAS_NUMPY = False
33
+
34
+ try:
35
+ from sentence_transformers import SentenceTransformer
36
+
37
+ HAS_SENTENCE_TRANSFORMERS = True
38
+ except ImportError:
39
+ HAS_SENTENCE_TRANSFORMERS = False
40
+
41
+ try:
42
+ from sklearn.metrics.pairwise import cosine_similarity
43
+
44
+ HAS_SKLEARN = True
45
+ except ImportError:
46
+ HAS_SKLEARN = False
47
+
48
+
49
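+ # bge-large for semantic/topic clustering (1024 dims). NOTE: the "-en" checkpoint below is the English variant of bge-large, so the Hebrew topic seeds may embed less reliably than with a multilingual model.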
50
# Default sentence-transformers checkpoint used by SemanticStyleAnalyzer.
SEMANTIC_MODEL = "BAAI/bge-large-en-v1.5"
# Messages are cut to this many characters before being embedded.
MAX_CHARS = 2000

# Predefined topic seeds for guided clustering (English + Hebrew).
# Each key is a topic name; the embeddings of its seed phrases are averaged
# into a single centroid, and messages are assigned to the topic whose
# centroid they are most similar to.
TOPIC_SEEDS = {
    # Software-engineering talk: code, APIs, version control, deployment.
    "technical": [
        "debugging the code",
        "implementing the feature",
        "API endpoint",
        "database query",
        "git commit",
        "pull request review",
        "לתקן את הבאג",
        "לממש את הפיצ'ר",
        "לעשות דיפלוי",
    ],
    # Small talk, greetings and laughter.
    "casual": [
        "haha that's funny",
        "what are you doing",
        "see you later",
        "good morning",
        "how was your day",
        "מה קורה",
        "מה נשמע",
        "חחח",
        "יאללה ביי",
        "בוקר טוב",
    ],
    # Workplace coordination: meetings, deadlines, follow-ups.
    "professional": [
        "meeting scheduled",
        "project deadline",
        "quarterly review",
        "client presentation",
        "follow up on the proposal",
        "פגישה נקבעה",
        "דדליין של הפרויקט",
        "לעקוב אחרי",
        "לסגור את הנושא",
    ],
    # Expressions of feeling (excitement, frustration, worry, affection).
    "emotional": [
        "I'm so excited",
        "that's frustrating",
        "really happy about",
        "worried about",
        "love this",
        "אני כל כך שמח",
        "זה מתסכל",
        "אוהב את זה",
        "דואג לגבי",
    ],
    # Phrases that introduce an explanation or an example.
    "explanatory": [
        "let me explain",
        "the reason is",
        "basically what happens is",
        "think of it like",
        "for example",
        "תן לי להסביר",
        "הסיבה היא",
        "בעצם מה שקורה",
        "לדוגמה",
    ],
}
112
+
113
+
114
@dataclass
class TopicCluster:
    """Messages that were assigned to one topic, plus their style summary.

    Only ``name`` is required; every other field has a neutral default so a
    cluster can be created first and filled in afterwards.
    """

    # --- identity / membership -------------------------------------------
    name: str  # topic label, e.g. a TOPIC_SEEDS key or "other"
    messages: list[str] = field(default_factory=list)  # message texts kept for reference
    centroid: Optional[list[float]] = None  # optional embedding centroid

    # --- per-topic style metrics -----------------------------------------
    message_count: int = 0  # number of messages analyzed for this topic
    avg_length: float = 0.0  # mean message length in characters
    formality: float = 0.5  # 0 = casual, 1 = formal; 0.5 is neutral
    emoji_rate: float = 0.0  # emoji matches per message
    question_rate: float = 0.0  # "?" characters per message
    exclamation_rate: float = 0.0  # "!" characters per message
    common_phrases: list[str] = field(default_factory=list)  # frequent n-grams
    language_mix: dict[str, float] = field(default_factory=dict)  # language -> share of messages
131
+
132
+
133
@dataclass
class SemanticStyleAnalysis:
    """Container for the full output of a semantic style analysis run."""

    # Topic name -> analyzed cluster for that topic.
    topic_clusters: dict[str, TopicCluster] = field(default_factory=dict)
    # Human-readable observations comparing the clusters with each other.
    cross_topic_insights: list[str] = field(default_factory=list)
    # Rendered markdown report; empty until one has been generated.
    style_rules_markdown: str = ""
140
+
141
+
142
+ class SemanticStyleAnalyzer:
143
+ """Analyze writing style patterns by topic/context."""
144
+
145
+ def __init__(self, model_name: str = SEMANTIC_MODEL):
146
+ if not HAS_NUMPY:
147
+ raise ImportError("numpy required. Install: pip install numpy")
148
+ if not HAS_SENTENCE_TRANSFORMERS:
149
+ raise ImportError("sentence-transformers required. Install: pip install sentence-transformers")
150
+ if not HAS_SKLEARN:
151
+ raise ImportError("scikit-learn required. Install: pip install scikit-learn")
152
+ self.model_name = model_name
153
+ self._model: Optional[SentenceTransformer] = None
154
+ self._topic_seed_embeddings: Optional[dict[str, np.ndarray]] = None
155
+
156
+ @property
157
+ def model(self) -> SentenceTransformer:
158
+ """Lazy load the embedding model."""
159
+ if self._model is None:
160
+ logger.info("Loading %s...", self.model_name)
161
+ self._model = SentenceTransformer(self.model_name)
162
+ return self._model
163
+
164
+ def _get_topic_seed_embeddings(self) -> dict[str, np.ndarray]:
165
+ """Get/compute embeddings for topic seed phrases."""
166
+ if self._topic_seed_embeddings is None:
167
+ self._topic_seed_embeddings = {}
168
+ for topic, seeds in TOPIC_SEEDS.items():
169
+ embeddings = self.model.encode(seeds, convert_to_numpy=True)
170
+ # Average the seed embeddings to get topic centroid
171
+ self._topic_seed_embeddings[topic] = np.mean(embeddings, axis=0)
172
+ return self._topic_seed_embeddings
173
+
174
+ def embed_messages(
175
+ self,
176
+ messages: list[str],
177
+ batch_size: int = 32,
178
+ show_progress: bool = True,
179
+ ) -> np.ndarray:
180
+ """Embed messages using bge-large for topic clustering."""
181
+ # Truncate long messages
182
+ truncated = [m[:MAX_CHARS] for m in messages]
183
+
184
+ embeddings = self.model.encode(
185
+ truncated,
186
+ batch_size=batch_size,
187
+ show_progress_bar=show_progress,
188
+ convert_to_numpy=True,
189
+ )
190
+ return embeddings
191
+
192
+ def assign_topics(
193
+ self,
194
+ messages: list[str],
195
+ embeddings: np.ndarray,
196
+ threshold: float = 0.3,
197
+ ) -> dict[str, list[int]]:
198
+ """Assign messages to topics based on similarity to seed centroids.
199
+
200
+ Args:
201
+ messages: List of message texts
202
+ embeddings: Message embeddings from embed_messages()
203
+ threshold: Minimum cosine similarity (0-1) to assign to a topic.
204
+ Messages below threshold go to "other".
205
+
206
+ Returns:
207
+ Dict mapping topic name to list of message indices
208
+ """
209
+ topic_seeds = self._get_topic_seed_embeddings()
210
+ assignments: dict[str, list[int]] = {topic: [] for topic in topic_seeds}
211
+ assignments["other"] = []
212
+
213
+ for i, emb in enumerate(embeddings):
214
+ best_topic = "other"
215
+ best_sim = threshold
216
+
217
+ for topic, seed_emb in topic_seeds.items():
218
+ sim = cosine_similarity([emb], [seed_emb])[0][0]
219
+ if sim > best_sim:
220
+ best_sim = sim
221
+ best_topic = topic
222
+
223
+ assignments[best_topic].append(i)
224
+
225
+ return assignments
226
+
227
+ def analyze_cluster_style(self, messages: list[str]) -> dict[str, Any]:
228
+ """Analyze style patterns within a cluster of messages."""
229
+ if not messages:
230
+ return {}
231
+
232
+ # Length analysis
233
+ lengths = [len(m) for m in messages]
234
+ avg_length = sum(lengths) / len(lengths)
235
+
236
+ # Formality indicators
237
+ informal_markers = [
238
+ r"\blol\b",
239
+ r"\bhaha\b",
240
+ r"\bחח\b",
241
+ r"\bomg\b",
242
+ r"\bbtw\b",
243
+ r"\bכן\b",
244
+ r"\bלא\b",
245
+ r"\bוואלה\b",
246
+ r"\bסבבה\b",
247
+ r"\bיאללה\b",
248
+ r"!!+",
249
+ r"\?\?+",
250
+ r"\.\.\.+",
251
+ ]
252
+ formal_markers = [
253
+ r"\bplease\b",
254
+ r"\bkindly\b",
255
+ r"\bregards\b",
256
+ r"\bthank you\b",
257
+ r"\bבבקשה\b",
258
+ r"\bתודה\b",
259
+ r"\bלהלן\b",
260
+ ]
261
+
262
+ informal_count = 0
263
+ formal_count = 0
264
+ for msg in messages:
265
+ msg_lower = msg.lower()
266
+ informal_count += sum(1 for p in informal_markers if re.search(p, msg_lower))
267
+ formal_count += sum(1 for p in formal_markers if re.search(p, msg_lower))
268
+
269
+ informal_ratio = min(1.0, informal_count / max(len(messages), 1))
270
+ formal_ratio = min(1.0, formal_count / max(len(messages), 1))
271
+ formality = 0.5 - (informal_ratio * 0.3) + (formal_ratio * 0.3)
272
+ formality = max(0.1, min(0.9, formality))
273
+
274
+ # Emoji rate
275
+ emoji_pattern = re.compile(
276
+ "["
277
+ "\U0001f600-\U0001f64f" # emoticons
278
+ "\U0001f300-\U0001f5ff" # symbols & pictographs
279
+ "\U0001f680-\U0001f6ff" # transport & map symbols
280
+ "\U0001f1e0-\U0001f1ff" # flags
281
+ "]+",
282
+ flags=re.UNICODE,
283
+ )
284
+ emoji_count = sum(len(emoji_pattern.findall(m)) for m in messages)
285
+ emoji_rate = emoji_count / len(messages)
286
+
287
+ # Punctuation rates
288
+ question_count = sum(m.count("?") for m in messages)
289
+ exclamation_count = sum(m.count("!") for m in messages)
290
+ question_rate = question_count / len(messages)
291
+ exclamation_rate = exclamation_count / len(messages)
292
+
293
+ # Language detection (simple Hebrew/English check)
294
+ hebrew_pattern = re.compile(r"[\u0590-\u05FF]")
295
+ english_count = 0
296
+ hebrew_count = 0
297
+ for msg in messages:
298
+ has_hebrew = bool(hebrew_pattern.search(msg))
299
+ has_english = bool(re.search(r"[a-zA-Z]", msg))
300
+ if has_hebrew:
301
+ hebrew_count += 1
302
+ if has_english:
303
+ english_count += 1
304
+
305
+ language_mix = {
306
+ "hebrew": hebrew_count / len(messages),
307
+ "english": english_count / len(messages),
308
+ }
309
+
310
+ # Common phrases (bigrams and trigrams)
311
+ words = []
312
+ for msg in messages:
313
+ words.extend(re.findall(r"\b\w+\b", msg.lower()))
314
+
315
+ # Guard against short word lists
316
+ bigrams = [f"{words[i]} {words[i + 1]}" for i in range(len(words) - 1)] if len(words) >= 2 else []
317
+ trigrams = (
318
+ [f"{words[i]} {words[i + 1]} {words[i + 2]}" for i in range(len(words) - 2)] if len(words) >= 3 else []
319
+ )
320
+
321
+ phrase_counts = Counter(bigrams + trigrams)
322
+ # Filter to meaningful phrases (appear 3+ times, not just stopwords)
323
+ min_phrase_count = 3
324
+ common_phrases = [
325
+ phrase
326
+ for phrase, count in phrase_counts.most_common(20)
327
+ if count >= min_phrase_count and len(phrase.split()) > 1
328
+ ][:10]
329
+
330
+ return {
331
+ "avg_length": avg_length,
332
+ "formality": formality,
333
+ "emoji_rate": emoji_rate,
334
+ "question_rate": question_rate,
335
+ "exclamation_rate": exclamation_rate,
336
+ "language_mix": language_mix,
337
+ "common_phrases": common_phrases,
338
+ "message_count": len(messages),
339
+ }
340
+
341
+ def analyze(
342
+ self,
343
+ messages: list[str],
344
+ min_cluster_size: int = 10,
345
+ ) -> SemanticStyleAnalysis:
346
+ """Run full semantic style analysis.
347
+
348
+ Args:
349
+ messages: List of message texts to analyze
350
+ min_cluster_size: Minimum messages for a topic to be included
351
+
352
+ Returns:
353
+ SemanticStyleAnalysis with topic clusters and insights
354
+ """
355
+ logger.info("Analyzing %d messages...", len(messages))
356
+
357
+ # Embed all messages
358
+ logger.info("Computing embeddings...")
359
+ embeddings = self.embed_messages(messages)
360
+
361
+ # Assign to topics
362
+ logger.info("Assigning topics...")
363
+ topic_assignments = self.assign_topics(messages, embeddings)
364
+
365
+ # Analyze each topic cluster
366
+ logger.info("Analyzing topic clusters...")
367
+ topic_clusters: dict[str, TopicCluster] = {}
368
+
369
+ for topic, indices in topic_assignments.items():
370
+ if len(indices) < min_cluster_size:
371
+ continue
372
+
373
+ cluster_messages = [messages[i] for i in indices]
374
+ style = self.analyze_cluster_style(cluster_messages)
375
+
376
+ cluster = TopicCluster(
377
+ name=topic,
378
+ messages=cluster_messages[:100], # Keep sample for reference
379
+ message_count=style.get("message_count", len(cluster_messages)),
380
+ avg_length=style.get("avg_length", 0),
381
+ formality=style.get("formality", 0.5),
382
+ emoji_rate=style.get("emoji_rate", 0),
383
+ question_rate=style.get("question_rate", 0),
384
+ exclamation_rate=style.get("exclamation_rate", 0),
385
+ common_phrases=style.get("common_phrases", []),
386
+ language_mix=style.get("language_mix", {}),
387
+ )
388
+ topic_clusters[topic] = cluster
389
+ logger.info(" %s: %d messages, formality=%.2f", topic, len(indices), cluster.formality)
390
+
391
+ # Generate cross-topic insights
392
+ insights = self._generate_insights(topic_clusters)
393
+
394
+ # Generate markdown rules
395
+ markdown = self._generate_markdown(topic_clusters, insights)
396
+
397
+ return SemanticStyleAnalysis(
398
+ topic_clusters=topic_clusters,
399
+ cross_topic_insights=insights,
400
+ style_rules_markdown=markdown,
401
+ )
402
+
403
+ def _generate_insights(self, clusters: dict[str, TopicCluster]) -> list[str]:
404
+ """Generate cross-topic insights by comparing clusters."""
405
+ insights = []
406
+
407
+ if len(clusters) < 2:
408
+ return insights
409
+
410
+ # Find most/least formal topic
411
+ formalities = [(t, c.formality) for t, c in clusters.items()]
412
+ formalities.sort(key=lambda x: x[1])
413
+
414
+ if len(formalities) >= 2:
415
+ most_casual = formalities[0]
416
+ most_formal = formalities[-1]
417
+ if most_formal[1] - most_casual[1] > 0.1:
418
+ insights.append(
419
+ f"Most formal in '{most_formal[0]}' contexts ({most_formal[1]:.2f}), "
420
+ f"most casual in '{most_casual[0]}' ({most_casual[1]:.2f})"
421
+ )
422
+
423
+ # Find where you use most emoji
424
+ emoji_rates = [(t, c.emoji_rate) for t, c in clusters.items()]
425
+ emoji_rates.sort(key=lambda x: x[1], reverse=True)
426
+ if emoji_rates[0][1] > 0.1:
427
+ insights.append(f"Uses most emoji in '{emoji_rates[0][0]}' contexts ({emoji_rates[0][1]:.2f} per message)")
428
+
429
+ # Language switching patterns
430
+ for topic, cluster in clusters.items():
431
+ hebrew = cluster.language_mix.get("hebrew", 0)
432
+ english = cluster.language_mix.get("english", 0)
433
+ if hebrew > 0.5 and english > 0.5:
434
+ insights.append(f"Frequently code-switches Hebrew/English in '{topic}' context")
435
+ elif hebrew > 0.8:
436
+ insights.append(f"Primarily Hebrew in '{topic}' context")
437
+
438
+ # Message length patterns
439
+ lengths = [(t, c.avg_length) for t, c in clusters.items()]
440
+ lengths.sort(key=lambda x: x[1])
441
+ if lengths[-1][1] > lengths[0][1] * 2:
442
+ insights.append(
443
+ f"Writes longest messages in '{lengths[-1][0]}' ({lengths[-1][1]:.0f} chars avg), "
444
+ f"shortest in '{lengths[0][0]}' ({lengths[0][1]:.0f} chars)"
445
+ )
446
+
447
+ return insights
448
+
449
+ def _generate_markdown(
450
+ self,
451
+ clusters: dict[str, TopicCluster],
452
+ insights: list[str],
453
+ ) -> str:
454
+ """Generate markdown style rules from analysis."""
455
+ lines = [
456
+ "# Your Writing Style (Semantic Analysis)",
457
+ "",
458
+ "Generated from message clustering by topic.",
459
+ "",
460
+ "## Cross-Context Insights",
461
+ "",
462
+ ]
463
+
464
+ for insight in insights:
465
+ lines.append(f"- {insight}")
466
+
467
+ lines.extend(["", "## By Context", ""])
468
+
469
+ for topic, cluster in clusters.items():
470
+ lines.append(f"### {topic.title()}")
471
+ lines.append("")
472
+ lines.append(f"- **Message count:** {cluster.message_count}")
473
+ lines.append(f"- **Average length:** {cluster.avg_length:.0f} characters")
474
+ lines.append(f"- **Formality:** {cluster.formality:.2f} (0=casual, 1=formal)")
475
+ lines.append(f"- **Emoji rate:** {cluster.emoji_rate:.2f} per message")
476
+
477
+ if cluster.language_mix:
478
+ lang_str = ", ".join(f"{lang}: {pct:.0%}" for lang, pct in cluster.language_mix.items())
479
+ lines.append(f"- **Language mix:** {lang_str}")
480
+
481
+ if cluster.common_phrases:
482
+ phrases = ", ".join(f'"{p}"' for p in cluster.common_phrases[:5])
483
+ lines.append(f"- **Common phrases:** {phrases}")
484
+
485
+ lines.append("")
486
+
487
+ lines.extend(
488
+ [
489
+ "## For Cover Letters & Professional Outreach",
490
+ "",
491
+ "Based on your patterns:",
492
+ "",
493
+ ]
494
+ )
495
+
496
+ # Add recommendations based on analysis
497
+ if "professional" in clusters:
498
+ prof = clusters["professional"]
499
+ lines.append(f"- Use formality level ~{prof.formality:.2f} (matches your work context)")
500
+
501
+ if "technical" in clusters:
502
+ tech = clusters["technical"]
503
+ lines.append(f"- Technical explanations avg {tech.avg_length:.0f} chars - keep similar length")
504
+ if tech.common_phrases:
505
+ lines.append(f"- You naturally use phrases like: {', '.join(tech.common_phrases[:3])}")
506
+
507
+ if "emotional" in clusters:
508
+ emo = clusters["emotional"]
509
+ if emo.exclamation_rate > 0.3:
510
+ lines.append("- You express enthusiasm with exclamation marks - use sparingly in professional context")
511
+
512
+ lines.append("- Avoid Hebrew in English-only professional contexts")
513
+ lines.append("")
514
+
515
+ return "\n".join(lines)
516
+
517
+ def save_analysis(
518
+ self,
519
+ analysis: SemanticStyleAnalysis,
520
+ output_dir: Path,
521
+ ) -> None:
522
+ """Save analysis results to files."""
523
+ output_dir.mkdir(parents=True, exist_ok=True)
524
+
525
+ # Save markdown rules
526
+ rules_path = output_dir / "semantic-style-rules.md"
527
+ rules_path.write_text(analysis.style_rules_markdown)
528
+ logger.info("Saved rules to %s", rules_path)
529
+
530
+ # Save JSON data for programmatic use
531
+ data = {
532
+ "topics": {
533
+ name: {
534
+ "message_count": cluster.message_count,
535
+ "avg_length": cluster.avg_length,
536
+ "formality": cluster.formality,
537
+ "emoji_rate": cluster.emoji_rate,
538
+ "question_rate": cluster.question_rate,
539
+ "exclamation_rate": cluster.exclamation_rate,
540
+ "language_mix": cluster.language_mix,
541
+ "common_phrases": cluster.common_phrases,
542
+ }
543
+ for name, cluster in analysis.topic_clusters.items()
544
+ },
545
+ "insights": analysis.cross_topic_insights,
546
+ }
547
+
548
+ json_path = output_dir / "semantic-style-data.json"
549
+ json_path.write_text(json.dumps(data, indent=2, ensure_ascii=False))
550
+ logger.info("Saved data to %s", json_path)
551
+
552
+
553
def analyze_semantic_style(
    messages: list[str],
    output_dir: Optional[Path] = None,
    min_cluster_size: int = 10,
) -> SemanticStyleAnalysis:
    """Run a semantic style analysis in one call, optionally saving it.

    Builds a default ``SemanticStyleAnalyzer``, analyzes ``messages`` and,
    when ``output_dir`` is given, writes the results there.

    Args:
        messages: Message texts to analyze.
        output_dir: Directory to save results into; nothing is written when
            this is omitted.
        min_cluster_size: Minimum number of messages a topic needs in order
            to be kept.

    Returns:
        The SemanticStyleAnalysis produced by the analyzer.
    """
    style_analyzer = SemanticStyleAnalyzer()
    result = style_analyzer.analyze(messages, min_cluster_size=min_cluster_size)

    # Nothing to persist: hand the result straight back.
    if not output_dir:
        return result

    style_analyzer.save_analysis(result, output_dir)
    return result