persnally 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/src/cli.js +18 -0
- package/build/src/daemon.js +4 -0
- package/build/src/dashboard.html +593 -137
- package/build/src/events.d.ts +24 -0
- package/build/src/events.js +10 -0
- package/build/src/importers/extract.js +12 -1
- package/build/src/mcp/index.js +48 -29
- package/build/src/prose.d.ts +10 -0
- package/build/src/prose.js +34 -0
- package/build/src/store.d.ts +8 -0
- package/build/src/store.js +22 -0
- package/build/src/stylometry.d.ts +21 -0
- package/build/src/stylometry.js +124 -0
- package/package.json +1 -1
package/build/src/events.d.ts
CHANGED
|
@@ -59,6 +59,29 @@ export declare const PAYLOAD_SCHEMAS: {
|
|
|
59
59
|
proficiency: z.ZodNumber;
|
|
60
60
|
basis: z.ZodString;
|
|
61
61
|
}, z.core.$strip>;
|
|
62
|
+
readonly "signal.style": z.ZodObject<{
|
|
63
|
+
dimension: z.ZodEnum<{
|
|
64
|
+
format: "format";
|
|
65
|
+
voice: "voice";
|
|
66
|
+
convention: "convention";
|
|
67
|
+
emphasis: "emphasis";
|
|
68
|
+
workflow: "workflow";
|
|
69
|
+
}>;
|
|
70
|
+
pattern: z.ZodString;
|
|
71
|
+
polarity: z.ZodEnum<{
|
|
72
|
+
does: "does";
|
|
73
|
+
avoids: "avoids";
|
|
74
|
+
prefers: "prefers";
|
|
75
|
+
insists: "insists";
|
|
76
|
+
}>;
|
|
77
|
+
confidence: z.ZodNumber;
|
|
78
|
+
evidence: z.ZodString;
|
|
79
|
+
basis: z.ZodEnum<{
|
|
80
|
+
observed: "observed";
|
|
81
|
+
stylometry: "stylometry";
|
|
82
|
+
correction: "correction";
|
|
83
|
+
}>;
|
|
84
|
+
}, z.core.$strip>;
|
|
62
85
|
readonly "context.read": z.ZodObject<{
|
|
63
86
|
scope: z.ZodString;
|
|
64
87
|
client_purpose: z.ZodString;
|
|
@@ -134,6 +157,7 @@ export declare const eventSchema: z.ZodObject<{
|
|
|
134
157
|
"signal.topic": "signal.topic";
|
|
135
158
|
"signal.assertion": "signal.assertion";
|
|
136
159
|
"signal.skill": "signal.skill";
|
|
160
|
+
"signal.style": "signal.style";
|
|
137
161
|
"context.read": "context.read";
|
|
138
162
|
"agent.question": "agent.question";
|
|
139
163
|
"agent.answer": "agent.answer";
|
package/build/src/events.js
CHANGED
|
@@ -33,6 +33,16 @@ export const PAYLOAD_SCHEMAS = {
|
|
|
33
33
|
proficiency: z.number().min(0).max(1),
|
|
34
34
|
basis: z.string(),
|
|
35
35
|
}),
|
|
36
|
+
// How the user writes/works — the prescriptive layer (docs/CONTEXT_DEPTH.md).
|
|
37
|
+
// Structured so it dedupes by `pattern` and consolidates into stable constants.
|
|
38
|
+
"signal.style": z.object({
|
|
39
|
+
dimension: z.enum(["voice", "convention", "emphasis", "format", "workflow"]),
|
|
40
|
+
pattern: z.string().min(1),
|
|
41
|
+
polarity: z.enum(["does", "avoids", "prefers", "insists"]),
|
|
42
|
+
confidence: z.number().min(0).max(1),
|
|
43
|
+
evidence: z.string(),
|
|
44
|
+
basis: z.enum(["observed", "stylometry", "correction"]),
|
|
45
|
+
}),
|
|
36
46
|
"context.read": z.object({
|
|
37
47
|
scope: z.string(),
|
|
38
48
|
client_purpose: z.string(),
|
|
@@ -5,16 +5,23 @@
|
|
|
5
5
|
import { z } from "zod";
|
|
6
6
|
import { newEvent, safeIso, uuidv7, PAYLOAD_SCHEMAS } from "../events.js";
|
|
7
7
|
import { anthropicExtract, DEFAULT_EXTRACT_MODEL } from "../llm.js";
|
|
8
|
+
import { proseLines, stripNoise } from "../prose.js";
|
|
9
|
+
import { analyzeVoice } from "../stylometry.js";
|
|
8
10
|
const MAX_CONVO_CHARS = 30_000;
|
|
9
11
|
const topicsExtraction = z.object({ topics: z.array(PAYLOAD_SCHEMAS["signal.topic"]) });
|
|
10
12
|
const assertionsExtraction = z.object({ assertions: z.array(PAYLOAD_SCHEMAS["signal.assertion"]) });
|
|
11
13
|
export async function extractEvents(parsed, opts, extract = anthropicExtract, model = DEFAULT_EXTRACT_MODEL) {
|
|
12
14
|
const batch = uuidv7();
|
|
13
15
|
const events = [];
|
|
16
|
+
const voiceCorpus = []; // clean prose for the deterministic voice fingerprint
|
|
14
17
|
for (const convo of parsed.conversations) {
|
|
15
18
|
if (!convo.userMessages.length)
|
|
16
19
|
continue;
|
|
17
|
-
const
|
|
20
|
+
const joined = convo.userMessages.join("\n");
|
|
21
|
+
voiceCorpus.push(...proseLines(joined));
|
|
22
|
+
const text = stripNoise(joined).slice(0, MAX_CONVO_CHARS); // strip pasted paths/URLs/logs before the LLM sees it
|
|
23
|
+
if (!text)
|
|
24
|
+
continue;
|
|
18
25
|
const result = await extract({
|
|
19
26
|
model,
|
|
20
27
|
instruction: "Extract 1-5 topic signals from this conversation's user messages. Weight = centrality, depth = engagement level, sentiment = user's attitude toward the topic. Capture decisions and rejected options as their own signals.",
|
|
@@ -42,6 +49,10 @@ export async function extractEvents(parsed, opts, extract = anthropicExtract, mo
|
|
|
42
49
|
events.push(newEvent("signal.assertion", opts.source, a, { kind: "import", batch, file: opts.file }));
|
|
43
50
|
}
|
|
44
51
|
}
|
|
52
|
+
// Deterministic voice fingerprint over the user's own prose — no LLM, no tokens.
|
|
53
|
+
for (const s of analyzeVoice(voiceCorpus).signals) {
|
|
54
|
+
events.push(newEvent("signal.style", opts.source, s, { kind: "import", batch, file: opts.file }));
|
|
55
|
+
}
|
|
45
56
|
const span = parsed.conversations.map((c) => c.created_at).sort();
|
|
46
57
|
events.push(newEvent("system.import", "system", {
|
|
47
58
|
importer: opts.importer,
|
package/build/src/mcp/index.js
CHANGED
|
@@ -49,38 +49,51 @@ async function recordRead(scope, purpose, items) {
|
|
|
49
49
|
}
|
|
50
50
|
}
|
|
51
51
|
// ── persnally_track — write path ────────────────────────────
|
|
52
|
-
|
|
52
|
+
const TOPIC_SCHEMA = z.object({
|
|
53
|
+
topic: z.string().describe("The topic, decision, or preference (e.g. 'Rust async programming', 'chose SQLite over Postgres')"),
|
|
54
|
+
weight: z.number().min(0).max(1),
|
|
55
|
+
intent: z.enum(["learning", "building", "researching", "deciding", "discussing", "debugging"]),
|
|
56
|
+
sentiment: z.enum(["positive", "negative", "neutral"]),
|
|
57
|
+
depth: z.enum(["mention", "moderate", "deep"]),
|
|
58
|
+
category: z.enum(["technology", "business", "finance", "career", "health", "science", "creative", "education", "lifestyle", "news", "other"]),
|
|
59
|
+
entities: z.array(z.string()),
|
|
60
|
+
});
|
|
61
|
+
const STYLE_SCHEMA = z.object({
|
|
62
|
+
dimension: z.enum(["voice", "convention", "emphasis", "format", "workflow"])
|
|
63
|
+
.describe("voice=tone/phrasing; convention=tools/rules; emphasis=what they insist on; format=structure; workflow=how they work"),
|
|
64
|
+
pattern: z.string().min(1).describe("a short, reusable instruction — e.g. 'prefers pnpm over npm', 'wants the falsification first', 'terse, no filler'"),
|
|
65
|
+
polarity: z.enum(["does", "avoids", "prefers", "insists"]),
|
|
66
|
+
confidence: z.number().min(0).max(1).default(0.6),
|
|
67
|
+
evidence: z.string().default("").describe("a brief quote or why you believe it"),
|
|
68
|
+
});
|
|
69
|
+
server.tool("persnally_track", `Track what builds the user's lasting context. Two kinds of signal, both optional — send whichever this conversation produced.
|
|
53
70
|
|
|
54
|
-
|
|
71
|
+
TOPICS — what they're engaged with (interests, decisions, accepted/rejected options).
|
|
72
|
+
- 1-5 per conversation; weight = centrality (0.1 brief … 1.0 main focus); depth = mention|moderate|deep; sentiment 'negative' deprioritizes; entities are specific names ("Next.js", not "web framework").
|
|
55
73
|
|
|
56
|
-
|
|
57
|
-
-
|
|
58
|
-
-
|
|
59
|
-
- Depth: "mention" | "moderate" | "deep" (extensive discussion or problem-solving)
|
|
60
|
-
- Sentiment: "negative" means frustration or dislike (deprioritizes, never boosts)
|
|
61
|
-
- Entities are specific names: "Next.js" not "web framework"
|
|
74
|
+
STYLE — HOW they write and work, so every AI can answer like them. High value, but easy to over-send: record only a CLEAR, REPEATED tell, never a one-off, at most 1-3 per conversation. Examples:
|
|
75
|
+
- voice: "terse, no filler" · convention: "prefers pnpm over npm", "no default exports" · emphasis: "wants the falsification first" · format: "answers in bullet points" · workflow: "kills ideas fast".
|
|
76
|
+
- Skip anything generic or already obvious. When unsure, don't.
|
|
62
77
|
|
|
63
|
-
The user opted in. Only structured signals are stored, locally, never raw messages.`, {
|
|
64
|
-
topics: z.array(
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
sentiment: z.enum(["positive", "negative", "neutral"]),
|
|
69
|
-
depth: z.enum(["mention", "moderate", "deep"]),
|
|
70
|
-
category: z.enum(["technology", "business", "finance", "career", "health", "science", "creative", "education", "lifestyle", "news", "other"]),
|
|
71
|
-
entities: z.array(z.string()),
|
|
72
|
-
})).min(1),
|
|
73
|
-
}, async ({ topics }) => guarded(async () => {
|
|
74
|
-
logEvent("tool_call", { tool: "persnally_track", topics: topics.length });
|
|
78
|
+
The user opted in. Only these structured signals are stored, locally, never raw messages.`, {
|
|
79
|
+
topics: z.array(TOPIC_SCHEMA).optional(),
|
|
80
|
+
style: z.array(STYLE_SCHEMA).optional(),
|
|
81
|
+
}, async ({ topics, style }) => guarded(async () => {
|
|
82
|
+
logEvent("tool_call", { tool: "persnally_track", topics: topics?.length ?? 0, style: style?.length ?? 0 });
|
|
75
83
|
const client = clientSlug();
|
|
76
|
-
const events =
|
|
77
|
-
type: "signal.topic",
|
|
78
|
-
source: `mcp:${client}`,
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
84
|
+
const events = [
|
|
85
|
+
...(topics ?? []).map((t) => ({ type: "signal.topic", source: `mcp:${client}`, payload: t, provenance: { kind: "mcp", client } })),
|
|
86
|
+
...(style ?? []).map((s) => ({ type: "signal.style", source: `mcp:${client}`, payload: { ...s, basis: "observed" }, provenance: { kind: "mcp", client } })),
|
|
87
|
+
];
|
|
88
|
+
if (!events.length)
|
|
89
|
+
return text("Nothing to track — pass topics and/or style signals.");
|
|
82
90
|
await daemonPost("/events", events);
|
|
83
|
-
|
|
91
|
+
const parts = [];
|
|
92
|
+
if (topics?.length)
|
|
93
|
+
parts.push(`${topics.length} topic(s): ${topics.map((t) => t.topic).join(", ")}`);
|
|
94
|
+
if (style?.length)
|
|
95
|
+
parts.push(`${style.length} style signal(s)`);
|
|
96
|
+
return text(`Recorded ${parts.join(" · ")}.`);
|
|
84
97
|
}));
|
|
85
98
|
// ── persnally_context — read path (the Phase 2 core) ────────
|
|
86
99
|
server.tool("persnally_context", `Get the user's personal context: who they are, what they're working on, and their current interests.
|
|
@@ -91,11 +104,12 @@ Call this at the START of a conversation (or when personalization would improve
|
|
|
91
104
|
}, async ({ detail, purpose }) => guarded(async () => {
|
|
92
105
|
logEvent("tool_call", { tool: "persnally_context", detail });
|
|
93
106
|
const client = encodeURIComponent(getClient());
|
|
94
|
-
const [profile, topics] = await Promise.all([
|
|
107
|
+
const [profile, topics, voice] = await Promise.all([
|
|
95
108
|
daemonGet(`/profile?client=${client}`),
|
|
96
109
|
daemonGet(`/topics?limit=${detail === "full" ? 25 : 10}&client=${client}`),
|
|
110
|
+
daemonGet("/voice"),
|
|
97
111
|
]);
|
|
98
|
-
if (!profile && !topics?.length) {
|
|
112
|
+
if (!profile && !topics?.length && !voice?.pack) {
|
|
99
113
|
return text("No context yet — the user hasn't imported data or tracked any signals.");
|
|
100
114
|
}
|
|
101
115
|
let out = "";
|
|
@@ -106,6 +120,11 @@ Call this at the START of a conversation (or when personalization would improve
|
|
|
106
120
|
items += sections.length;
|
|
107
121
|
out += sections.map((s) => `## ${s.title}\n${s.body}`).join("\n\n");
|
|
108
122
|
}
|
|
123
|
+
// The prescriptive layer: how to write/answer so it fits this user, not a generic one.
|
|
124
|
+
if (voice?.pack) {
|
|
125
|
+
out += `${out ? "\n\n" : ""}# How to write for this user\n${voice.pack}`;
|
|
126
|
+
items += voice.items?.length ?? 0;
|
|
127
|
+
}
|
|
109
128
|
if (topics?.length) {
|
|
110
129
|
out += `\n\n# Current interests (decay-weighted)\n`;
|
|
111
130
|
out += topics.map((t) => `- ${t.topic} (${t.category}, ${t.dominant_intent}, weight ${t.weight.toFixed(2)})`).join("\n");
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Corpus hygiene. Imported prompts are polluted with pasted data (file paths,
|
|
3
|
+
* URLs, JSON/logs) and injected blocks (task notifications, reminders, command
|
|
4
|
+
* palettes, tool output). Unfiltered, that noise swamps both topic extraction
|
|
5
|
+
* and the voice fingerprint. See docs/CONTEXT_DEPTH.md.
|
|
6
|
+
*/
|
|
7
|
+
/** Remove injected blocks, fenced code, leftover markup tags, URLs, and filesystem paths, then collapse the leftover whitespace. Prose is otherwise kept intact. */
|
|
8
|
+
export declare function stripNoise(text: string): string;
|
|
9
|
+
/** Strict: only the prose lines a human actually wrote — for stylometry. */
|
|
10
|
+
export declare function proseLines(text: string): string[];
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Corpus hygiene. Imported prompts are polluted with pasted data (file paths,
|
|
3
|
+
* URLs, JSON/logs) and injected blocks (task notifications, reminders, command
|
|
4
|
+
* palettes, tool output). Unfiltered, that noise swamps both topic extraction
|
|
5
|
+
* and the voice fingerprint. See docs/CONTEXT_DEPTH.md.
|
|
6
|
+
*/
|
|
7
|
+
// English function words: a line containing at least one of these (matched on the lowercased line) reads as a sentence, not pasted data.
|
|
8
|
+
const FUNCTION_WORD = /\b(the|a|an|i|to|and|is|it|you|we|that|this|of|for|in|on|do|are|be|can|should|need|want|make|how|what|why|let|so|but|not|just|with|like|now|also|when|if|because|about)\b/;
|
|
9
|
+
/**
 * Remove injected blocks, fenced code, markup tags, URLs, and filesystem paths,
 * then collapse the whitespace left behind. Prose is otherwise kept intact.
 *
 * Each removed span is replaced by a single space so neighbouring words never fuse.
 *
 * @param text Raw user-message text, possibly polluted with pasted data.
 * @returns The cleaned, trimmed text.
 */
export function stripNoise(text) {
    return text
        // Fenced code blocks (``` … ```).
        .replace(/```[\s\S]*?```/g, " ")
        // Injected blocks. The closing tag must match the opening tag (\1): closing on
        // *any* tag would end the block at the first nested </b>, </code>, … and leak
        // the rest of the injected content into the corpus.
        .replace(/<(task-notification|system-reminder|local-command[\w-]*|command-[\w-]*)(?:\s[^>]*)?>[\s\S]*?<\/\1\s*>/gi, " ")
        // Any remaining markup tag. Kept to a single line and free of nested angle
        // brackets so a stray "<x" in prose cannot swallow text up to a distant ">".
        .replace(/<\/?[a-z][^<>\n]*>/gi, " ")
        .replace(/https?:\/\/\S+/g, " ")
        .replace(/(?:[~\w.\-]+)?(?:\/[\w.\-]+){2,}\/?/g, " ") // /a/b style paths
        // Tidy up the gaps the removals left behind.
        .replace(/[ \t]{2,}/g, " ")
        .replace(/\n{3,}/g, "\n\n")
        .trim();
}
|
|
21
|
+
/**
 * Strict filter for stylometry: returns only the lines that look like prose a
 * human actually typed.
 *
 * The text is first cleaned with stripNoise, then split on newlines and trimmed.
 * A line is kept when it has at least two whitespace-separated words, at least
 * 60% of its characters are ASCII letters, and it contains a function word.
 */
export function proseLines(text) {
    const kept = [];
    for (const rawLine of stripNoise(text).split("\n")) {
        const line = rawLine.trim();
        // Single tokens (ids, lone words) are not sentences.
        if (line.split(/\s+/).length < 2)
            continue;
        const letterCount = (line.match(/[a-zA-Z]/g) || []).length;
        // Low letter density means json/logs/ids.
        if (!line.length || letterCount / line.length < 0.6)
            continue;
        if (FUNCTION_WORD.test(" " + line.toLowerCase() + " "))
            kept.push(line);
    }
    return kept;
}
|
package/build/src/store.d.ts
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
* Single source of truth per docs/EVENT_SCHEMA.md; views can always be re-derived.
|
|
4
4
|
*/
|
|
5
5
|
import { type PersnallyEvent } from "./events.js";
|
|
6
|
+
import { type StyleSignal } from "./stylometry.js";
|
|
6
7
|
export declare const DEFAULT_DB_PATH: string;
|
|
7
8
|
export interface QueryOpts {
|
|
8
9
|
type?: string;
|
|
@@ -53,6 +54,13 @@ export declare class EventStore {
|
|
|
53
54
|
rebuild(now?: number): void;
|
|
54
55
|
saveProfile(p: StoredProfile): void;
|
|
55
56
|
getProfile(): StoredProfile | null;
|
|
57
|
+
/** The voice/convention profile — style signals deduped by dimension + case-insensitive pattern (newest wins), sorted by confidence (highest first) and capped at 28 items. */
|
|
58
|
+
voice(): {
|
|
59
|
+
pack: string;
|
|
60
|
+
items: StyleSignal[];
|
|
61
|
+
};
|
|
62
|
+
/** Drops style signals of one basis so a deterministic re-run replaces them (live `observed`/`correction` signals are kept). */
|
|
63
|
+
clearStyleByBasis(basis: string): number;
|
|
56
64
|
/** Hard-deletes matching topic events plus derived events referencing them, then rebuilds. */
|
|
57
65
|
forgetTopic(topic: string): number;
|
|
58
66
|
/** Removes every event from one import batch — a bad import is fully reversible. */
|
package/build/src/store.js
CHANGED
|
@@ -8,6 +8,7 @@ import { dirname, join } from "node:path";
|
|
|
8
8
|
import { topicWeight } from "./decay.js";
|
|
9
9
|
import { normalizeTopic, validateEvent } from "./events.js";
|
|
10
10
|
import { DATA_DIR } from "./paths.js";
|
|
11
|
+
import { assemblePack } from "./stylometry.js";
|
|
11
12
|
const VIEW_SCHEMA_VERSION = 2;
|
|
12
13
|
export const DEFAULT_DB_PATH = join(DATA_DIR, "persnally.db");
|
|
13
14
|
export class EventStore {
|
|
@@ -195,6 +196,27 @@ export class EventStore {
|
|
|
195
196
|
const row = this.db.prepare("SELECT * FROM view_profile WHERE id = 1").get();
|
|
196
197
|
return row ? { ...row, sections: JSON.parse(row.sections) } : null;
|
|
197
198
|
}
|
|
199
|
+
/** The voice/convention profile — style signals deduped by pattern (newest wins), richest first. */
|
|
200
|
+
voice() {
|
|
201
|
+
const byPattern = new Map();
|
|
202
|
+
// query() returns ts DESC, so the first occurrence of a pattern is the most recent.
|
|
203
|
+
for (const e of this.query({ type: "signal.style", limit: 1_000_000 })) {
|
|
204
|
+
const p = e.payload;
|
|
205
|
+
const key = `${p.dimension}|${p.pattern.toLowerCase()}`;
|
|
206
|
+
if (!byPattern.has(key))
|
|
207
|
+
byPattern.set(key, p);
|
|
208
|
+
}
|
|
209
|
+
// Cap the served set: live `observed` signals accrue over time, so bound it
|
|
210
|
+
// to the richest few (consolidation distills further in Slice 3).
|
|
211
|
+
const items = [...byPattern.values()].sort((a, b) => b.confidence - a.confidence).slice(0, 28);
|
|
212
|
+
return { pack: assemblePack(items), items };
|
|
213
|
+
}
|
|
214
|
+
/** Drops style signals of one basis so a deterministic re-run replaces them (live `observed`/`correction` signals are kept). */
|
|
215
|
+
clearStyleByBasis(basis) {
|
|
216
|
+
return this.db
|
|
217
|
+
.prepare("DELETE FROM events WHERE type = 'signal.style' AND json_extract(payload, '$.basis') = ?")
|
|
218
|
+
.run(basis).changes;
|
|
219
|
+
}
|
|
198
220
|
/** Hard-deletes matching topic events plus derived events referencing them, then rebuilds. */
|
|
199
221
|
forgetTopic(topic) {
|
|
200
222
|
const key = normalizeTopic(topic);
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Deterministic voice fingerprint — no LLM, no tokens, nothing leaves the machine.
|
|
3
|
+
* Turns the user's own prose (already noise-filtered via prose.ts) into structured
|
|
4
|
+
* signal.style payloads + a prescriptive "voice" pack. See docs/CONTEXT_DEPTH.md.
|
|
5
|
+
*/
|
|
6
|
+
import { z } from "zod";
|
|
7
|
+
import { PAYLOAD_SCHEMAS } from "./events.js";
|
|
8
|
+
export type StyleSignal = z.infer<(typeof PAYLOAD_SCHEMAS)["signal.style"]>;
|
|
9
|
+
export interface VoiceProfile {
|
|
10
|
+
signals: StyleSignal[];
|
|
11
|
+
words: {
|
|
12
|
+
word: string;
|
|
13
|
+
count: number;
|
|
14
|
+
}[];
|
|
15
|
+
pack: string;
|
|
16
|
+
prompts: number;
|
|
17
|
+
}
|
|
18
|
+
/** Compute a voice profile from prose messages (each may be multi-line). */
|
|
19
|
+
export declare function analyzeVoice(messages: string[]): VoiceProfile;
|
|
20
|
+
/** Build the system-prompt-ready "voice" line from style signals (shared by import + serving). */
|
|
21
|
+
export declare function assemblePack(signals: StyleSignal[]): string;
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Deterministic voice fingerprint — no LLM, no tokens, nothing leaves the machine.
|
|
3
|
+
* Turns the user's own prose (already noise-filtered via prose.ts) into structured
|
|
4
|
+
* signal.style payloads + a prescriptive "voice" pack. See docs/CONTEXT_DEPTH.md.
|
|
5
|
+
*/
|
|
6
|
+
// Stop words that carry no stylistic signal. Contractions are listed without their
// apostrophe ("im", "ive", "youre", …). Consulted by allStop and by analyzeVoice's
// distinctive-word filter.
const STOP = new Set(("a an the and or but if then so of to in on for with at by from as is are was were be been being this that these those it its i you he she we they me my your our their them us do does did done have has had having will would can could should may might must not no yes what which who when where why how all any both each few more most other some such only own same than too very just about into over after before above below up down out off again once here there im ive youre were theyre lets")
    .split(/\s+/));
// Command verbs: analyzeVoice counts a sentence as a directive when its first token is one of these.
const DIRECTIVE = new Set("make fix add remove create give check use keep build write update ensure confirm let lets do run show change implement refactor delete set move find get take generate review test verify explain tell help put start stop send pull push merge commit"
    .split(" "));
// Hedging phrases, substring-matched against each lowercased sentence.
const HEDGE = ["maybe", "i think", "probably", "perhaps", "kind of", "sort of", "i guess", "might be", "not sure", "i feel"];
// Matches a single character with the Unicode Extended_Pictographic property (used to count emoji).
const EMOJI = /\p{Extended_Pictographic}/gu;
|
|
12
|
+
/**
 * Lowercase `s` and split it into word tokens: runs of ASCII letters/digits, with
 * apostrophes after the first character kept (so "don't" stays one token).
 *
 * A typographic apostrophe (’ U+2019) sitting between two word characters is folded
 * to ' first, so “don’t” and "don't" produce the same token instead of "don" + "t".
 * Curly quotes used as quotation marks are left alone.
 *
 * @returns The tokens in order; an empty array when there are none.
 */
const tokenize = (s) => s
    .toLowerCase()
    .replace(/([a-z0-9])\u2019(?=[a-z0-9])/g, "$1'")
    .match(/[a-z0-9][a-z0-9']*/g) || [];
|
|
13
|
+
/**
 * Median of a list of numbers; 0 for an empty list.
 * Works on a sorted copy, so the caller's array is not reordered.
 */
const median = (xs) => {
    const n = xs.length;
    if (n === 0)
        return 0;
    const sorted = xs.slice().sort((lo, hi) => lo - hi);
    const mid = Math.floor(n / 2);
    if (n % 2 === 1)
        return sorted[mid];
    // Even count: average the two middle values.
    return (sorted[mid - 1] + sorted[mid]) / 2;
};
|
|
19
|
+
/**
 * True when every space-separated word of the n-gram `g` is a stop word or a bare number,
 * i.e. the phrase carries nothing distinctive.
 *
 * STOP lists contractions without apostrophes ("im", "youre", "lets") while tokens keep
 * them ("i'm", "you're", "let's"), so each word is also looked up with its apostrophes removed.
 */
const allStop = (g) => g
    .split(" ")
    .every((w) => STOP.has(w) || STOP.has(w.replace(/'/g, "")) || /^\d+$/.test(w));
|
|
20
|
+
/**
 * Compute a voice profile from prose messages (each may be multi-line).
 *
 * Counts per-message features (emoji, lowercase/uppercase standalone "i", bullet lines),
 * distinctive words, 3- and 4-word phrases, and per-sentence features (length, hedging,
 * leading command verb, pleasantries), then turns them into `signal.style` payloads with
 * basis "stylometry".
 *
 * @param messages Prose strings to analyse.
 * @returns `signals` (style payloads), `words` (up to 18 frequent distinctive words),
 *          `pack` (assemblePack over the signals) and `prompts` (messages.length).
 *          All collections are empty when there are no messages or no sentences.
 */
export function analyzeVoice(messages) {
    if (!messages.length)
        return { signals: [], words: [], pack: "", prompts: 0 };
    // STOP lists contractions without apostrophes ("im", "youre") while tokenize keeps
    // them ("i'm", "you're"), so check both spellings.
    const isStop = (w) => STOP.has(w) || STOP.has(w.replace(/'/g, ""));
    const uni = new Map(), tri = new Map(), quad = new Map();
    const sentLens = [];
    let sent = 0, dir = 0, hedge = 0, emoji = 0, lowerI = 0, upperI = 0, please = 0, bulletLines = 0;
    for (const msg of messages) {
        emoji += (msg.match(EMOJI) || []).length;
        // Standalone "i"/"I". The trailing separator is a lookahead so adjacent
        // occurrences ("i i") are each counted; both ' and ’ count as a contraction mark.
        lowerI += (msg.match(/(?:^|\s)i(?=['\u2019\s]|$)/g) || []).length;
        upperI += (msg.match(/(?:^|\s)I(?=['\u2019\s]|$)/g) || []).length;
        for (const ln of msg.split("\n"))
            if (/^\s*[-*•]\s/.test(ln))
                bulletLines++;
        const words = tokenize(msg);
        // Distinctive words: non-stop, at least 4 characters, not purely numeric.
        for (const w of words) {
            if (!isStop(w) && w.length >= 4 && !/^\d+$/.test(w))
                uni.set(w, (uni.get(w) || 0) + 1);
        }
        // Sliding 3- and 4-word windows over the whole message.
        for (let i = 0; i < words.length - 2; i++) {
            const g = words.slice(i, i + 3).join(" ");
            tri.set(g, (tri.get(g) || 0) + 1);
        }
        for (let i = 0; i < words.length - 3; i++) {
            const g = words.slice(i, i + 4).join(" ");
            quad.set(g, (quad.get(g) || 0) + 1);
        }
        // Sentences: runs of non-terminator characters plus any trailing . ! ?
        for (const raw of msg.match(/[^.!?\n]+[.!?]*/g) || []) {
            const s = raw.trim();
            if (!s)
                continue;
            sent++;
            const sw = tokenize(s);
            if (sw.length)
                sentLens.push(sw.length);
            const low = " " + s.toLowerCase() + " ";
            if (HEDGE.some((h) => low.includes(h)))
                hedge++;
            if (sw[0] && DIRECTIVE.has(sw[0]))
                dir++;
            if (low.includes(" please ") || low.includes(" thanks") || low.includes("thank you"))
                please++;
        }
    }
    if (!sent)
        return { signals: [], words: [], pack: "", prompts: messages.length };
    // Minimum repeat count for a word/phrase to count as recurring: 1% of messages, at least 3.
    const minP = Math.max(3, Math.round(messages.length * 0.01));
    const rate = (n) => n / sent;
    // distinctive repeated phrases — rank by frequency (tiebreak longer); collapse
    // overlapping windows of the same phrase by shared-token overlap, not just substring.
    const phrases = [];
    const keptTokens = [];
    for (const [g, c] of [...quad.entries(), ...tri.entries()]
        .filter(([g, c]) => c >= minP && !allStop(g))
        .sort((a, b) => b[1] - a[1] || b[0].length - a[0].length)) {
        const gt = g.split(" ");
        if (keptTokens.some((k) => gt.filter((w) => k.has(w)).length >= 2))
            continue; // same phrase, different window
        phrases.push({ phrase: g, count: c });
        keptTokens.push(new Set(gt));
        if (phrases.length >= 8)
            break;
    }
    const signals = [];
    const med = median(sentLens);
    // Appends one stylometry signal; confidence is rounded to two decimals.
    const add = (dimension, pattern, polarity, confidence, evidence) => signals.push({ dimension, pattern, polarity, confidence: Math.round(confidence * 100) / 100, evidence, basis: "stylometry" });
    // tone constants
    if (med <= 11)
        add("voice", "terse — short, declarative sentences", "does", 0.85, `median ${med} words/sentence`);
    else if (med >= 18)
        add("voice", "writes in long, detailed sentences", "does", 0.8, `median ${med} words/sentence`);
    if (rate(dir) > 0.15)
        add("voice", "leads with imperatives, minimal preamble", "does", 0.75, `${Math.round(rate(dir) * 100)}% of sentences open with a command verb`);
    if (rate(hedge) < 0.05)
        add("voice", "states things flatly; rarely hedges", "does", 0.8, `hedging in ${Math.round(rate(hedge) * 100)}% of sentences`);
    if (emoji / messages.length < 0.02)
        add("format", "no emoji", "avoids", 0.7, `${emoji} emoji across ${messages.length} prompts`);
    if (lowerI > upperI * 1.3)
        add("format", "casual register — lowercases “i”", "does", 0.7, `“i” ${lowerI}× vs “I” ${upperI}×`);
    if (please < messages.length * 0.05)
        add("voice", "skips pleasantries", "does", 0.6, `${please} please/thanks across ${messages.length} prompts`);
    if (bulletLines > messages.length * 0.25)
        add("format", "structures answers with bullet points", "prefers", 0.65, `${bulletLines} bulleted lines`);
    // recurring phrasing → emphasis (these tend to be the user's repeated instructions/values)
    for (const { phrase, count } of phrases)
        add("emphasis", phrase, "insists", Math.min(0.9, 0.5 + count / (minP * 6)), `${count}×`);
    const words = [...uni.entries()].filter(([, c]) => c >= minP).sort((a, b) => b[1] - a[1]).slice(0, 18).map(([word, count]) => ({ word, count }));
    return { signals, words, pack: assemblePack(signals), prompts: messages.length };
}
|
|
114
|
+
/**
 * Build the system-prompt-ready "voice" line from style signals (shared by import + serving).
 *
 * Non-emphasis patterns are listed first, in input order; emphasis patterns are quoted
 * and appended as "recurring phrasing", limited to the first five.
 *
 * Patterns are normalised before joining: internal whitespace (including newlines) is
 * collapsed to single spaces, trailing ".", ";", "," are dropped, and blank patterns are
 * skipped. Otherwise a pattern such as "terse." would produce ".;" or ".." in the output.
 *
 * @param signals Style signals; only `dimension` and `pattern` are read.
 * @returns A single "Write like this user: …." line, or "" when nothing usable remains.
 */
export function assemblePack(signals) {
    const clean = (pattern) => String(pattern ?? "")
        .replace(/\s+/g, " ")
        .trim()
        .replace(/[.;,\s]+$/, "");
    const tone = [];
    const phrases = [];
    for (const s of signals) {
        const pattern = clean(s.pattern);
        if (!pattern)
            continue;
        if (s.dimension === "emphasis")
            phrases.push(`“${pattern}”`);
        else
            tone.push(pattern);
    }
    if (!tone.length && !phrases.length)
        return "";
    const parts = [...tone];
    if (phrases.length)
        parts.push(`recurring phrasing: ${phrases.slice(0, 5).join(", ")}`);
    return `Write like this user: ${parts.join("; ")}.`;
}
|