@rankcli/agent-runtime 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +242 -0
- package/dist/analyzer-2CSWIQGD.mjs +6 -0
- package/dist/chunk-YNZYHEYM.mjs +774 -0
- package/dist/index.d.mts +4012 -0
- package/dist/index.d.ts +4012 -0
- package/dist/index.js +29672 -0
- package/dist/index.mjs +28602 -0
- package/package.json +53 -0
- package/scripts/build-deno.ts +134 -0
- package/src/audit/ai/analyzer.ts +347 -0
- package/src/audit/ai/index.ts +29 -0
- package/src/audit/ai/prompts/content-analysis.ts +271 -0
- package/src/audit/ai/types.ts +179 -0
- package/src/audit/checks/additional-checks.ts +439 -0
- package/src/audit/checks/ai-citation-worthiness.ts +399 -0
- package/src/audit/checks/ai-content-structure.ts +325 -0
- package/src/audit/checks/ai-readiness.ts +339 -0
- package/src/audit/checks/anchor-text.ts +179 -0
- package/src/audit/checks/answer-conciseness.ts +322 -0
- package/src/audit/checks/asset-minification.ts +270 -0
- package/src/audit/checks/bing-optimization.ts +206 -0
- package/src/audit/checks/brand-mention-optimization.ts +349 -0
- package/src/audit/checks/caching-headers.ts +305 -0
- package/src/audit/checks/canonical-advanced.ts +150 -0
- package/src/audit/checks/canonical-domain.ts +196 -0
- package/src/audit/checks/citation-quality.ts +358 -0
- package/src/audit/checks/client-rendering.ts +542 -0
- package/src/audit/checks/color-contrast.ts +342 -0
- package/src/audit/checks/content-freshness.ts +170 -0
- package/src/audit/checks/content-science.ts +589 -0
- package/src/audit/checks/conversion-elements.ts +526 -0
- package/src/audit/checks/crawlability.ts +220 -0
- package/src/audit/checks/directory-listing.ts +172 -0
- package/src/audit/checks/dom-analysis.ts +191 -0
- package/src/audit/checks/dom-size.ts +246 -0
- package/src/audit/checks/duplicate-content.ts +194 -0
- package/src/audit/checks/eeat-signals.ts +990 -0
- package/src/audit/checks/entity-seo.ts +396 -0
- package/src/audit/checks/featured-snippet.ts +473 -0
- package/src/audit/checks/freshness-signals.ts +443 -0
- package/src/audit/checks/funnel-intent.ts +463 -0
- package/src/audit/checks/hreflang.ts +174 -0
- package/src/audit/checks/html-compliance.ts +302 -0
- package/src/audit/checks/image-dimensions.ts +167 -0
- package/src/audit/checks/images.ts +160 -0
- package/src/audit/checks/indexnow.ts +275 -0
- package/src/audit/checks/interactive-tools.ts +475 -0
- package/src/audit/checks/internal-link-graph.ts +436 -0
- package/src/audit/checks/keyword-analysis.ts +239 -0
- package/src/audit/checks/keyword-cannibalization.ts +385 -0
- package/src/audit/checks/keyword-placement.ts +471 -0
- package/src/audit/checks/links.ts +203 -0
- package/src/audit/checks/llms-txt.ts +224 -0
- package/src/audit/checks/local-seo.ts +296 -0
- package/src/audit/checks/mobile.ts +167 -0
- package/src/audit/checks/modern-images.ts +226 -0
- package/src/audit/checks/navboost-signals.ts +395 -0
- package/src/audit/checks/on-page.ts +209 -0
- package/src/audit/checks/page-resources.ts +285 -0
- package/src/audit/checks/pagination.ts +180 -0
- package/src/audit/checks/performance.ts +153 -0
- package/src/audit/checks/platform-presence.ts +580 -0
- package/src/audit/checks/redirect-analysis.ts +153 -0
- package/src/audit/checks/redirect-chain.ts +389 -0
- package/src/audit/checks/resource-hints.ts +420 -0
- package/src/audit/checks/responsive-css.ts +247 -0
- package/src/audit/checks/responsive-images.ts +396 -0
- package/src/audit/checks/review-ecosystem.ts +415 -0
- package/src/audit/checks/robots-validation.ts +373 -0
- package/src/audit/checks/security-headers.ts +172 -0
- package/src/audit/checks/security.ts +144 -0
- package/src/audit/checks/serp-preview.ts +251 -0
- package/src/audit/checks/site-maturity.ts +444 -0
- package/src/audit/checks/social-meta.test.ts +275 -0
- package/src/audit/checks/social-meta.ts +134 -0
- package/src/audit/checks/soft-404.ts +151 -0
- package/src/audit/checks/structured-data.ts +238 -0
- package/src/audit/checks/tech-detection.ts +496 -0
- package/src/audit/checks/topical-clusters.ts +435 -0
- package/src/audit/checks/tracker-bloat.ts +462 -0
- package/src/audit/checks/tracking-verification.test.ts +371 -0
- package/src/audit/checks/tracking-verification.ts +636 -0
- package/src/audit/checks/url-safety.ts +682 -0
- package/src/audit/deno-entry.ts +66 -0
- package/src/audit/discovery/index.ts +15 -0
- package/src/audit/discovery/link-crawler.ts +232 -0
- package/src/audit/discovery/repo-routes.ts +347 -0
- package/src/audit/engine.ts +620 -0
- package/src/audit/fixes/index.ts +209 -0
- package/src/audit/fixes/social-meta-fixes.test.ts +329 -0
- package/src/audit/fixes/social-meta-fixes.ts +463 -0
- package/src/audit/index.ts +74 -0
- package/src/audit/runner.test.ts +299 -0
- package/src/audit/runner.ts +130 -0
- package/src/audit/types.ts +1953 -0
- package/src/content/featured-snippet.ts +367 -0
- package/src/content/generator.test.ts +534 -0
- package/src/content/generator.ts +501 -0
- package/src/content/headline.ts +317 -0
- package/src/content/index.ts +62 -0
- package/src/content/intent.ts +258 -0
- package/src/content/keyword-density.ts +349 -0
- package/src/content/readability.ts +262 -0
- package/src/executor.ts +336 -0
- package/src/fixer.ts +416 -0
- package/src/frameworks/detector.test.ts +248 -0
- package/src/frameworks/detector.ts +371 -0
- package/src/frameworks/index.ts +68 -0
- package/src/frameworks/recipes/angular.yaml +171 -0
- package/src/frameworks/recipes/astro.yaml +206 -0
- package/src/frameworks/recipes/django.yaml +180 -0
- package/src/frameworks/recipes/laravel.yaml +137 -0
- package/src/frameworks/recipes/nextjs.yaml +268 -0
- package/src/frameworks/recipes/nuxt.yaml +175 -0
- package/src/frameworks/recipes/rails.yaml +188 -0
- package/src/frameworks/recipes/react.yaml +202 -0
- package/src/frameworks/recipes/sveltekit.yaml +154 -0
- package/src/frameworks/recipes/vue.yaml +137 -0
- package/src/frameworks/recipes/wordpress.yaml +209 -0
- package/src/frameworks/suggestion-engine.ts +320 -0
- package/src/geo/geo-content.test.ts +305 -0
- package/src/geo/geo-content.ts +266 -0
- package/src/geo/geo-history.test.ts +473 -0
- package/src/geo/geo-history.ts +433 -0
- package/src/geo/geo-tracker.test.ts +359 -0
- package/src/geo/geo-tracker.ts +411 -0
- package/src/geo/index.ts +10 -0
- package/src/git/commit-helper.test.ts +261 -0
- package/src/git/commit-helper.ts +329 -0
- package/src/git/index.ts +12 -0
- package/src/git/pr-helper.test.ts +284 -0
- package/src/git/pr-helper.ts +307 -0
- package/src/index.ts +66 -0
- package/src/keywords/ai-keyword-engine.ts +1062 -0
- package/src/keywords/ai-summarizer.ts +387 -0
- package/src/keywords/ci-mode.ts +555 -0
- package/src/keywords/engine.ts +359 -0
- package/src/keywords/index.ts +151 -0
- package/src/keywords/llm-judge.ts +357 -0
- package/src/keywords/nlp-analysis.ts +706 -0
- package/src/keywords/prioritizer.ts +295 -0
- package/src/keywords/site-crawler.ts +342 -0
- package/src/keywords/sources/autocomplete.ts +139 -0
- package/src/keywords/sources/competitive-search.ts +450 -0
- package/src/keywords/sources/competitor-analysis.ts +374 -0
- package/src/keywords/sources/dataforseo.ts +206 -0
- package/src/keywords/sources/free-sources.ts +294 -0
- package/src/keywords/sources/gsc.ts +123 -0
- package/src/keywords/topic-grouping.ts +327 -0
- package/src/keywords/types.ts +144 -0
- package/src/keywords/wizard.ts +457 -0
- package/src/loader.ts +40 -0
- package/src/reports/index.ts +7 -0
- package/src/reports/report-generator.test.ts +293 -0
- package/src/reports/report-generator.ts +713 -0
- package/src/scheduler/alerts.test.ts +458 -0
- package/src/scheduler/alerts.ts +328 -0
- package/src/scheduler/index.ts +8 -0
- package/src/scheduler/scheduled-audit.test.ts +377 -0
- package/src/scheduler/scheduled-audit.ts +149 -0
- package/src/test/integration-test.ts +325 -0
- package/src/tools/analyzer.ts +373 -0
- package/src/tools/crawl.ts +293 -0
- package/src/tools/files.ts +301 -0
- package/src/tools/h1-fixer.ts +249 -0
- package/src/tools/index.ts +67 -0
- package/src/tracking/github-action.ts +326 -0
- package/src/tracking/google-analytics.ts +265 -0
- package/src/tracking/index.ts +45 -0
- package/src/tracking/report-generator.ts +386 -0
- package/src/tracking/search-console.ts +335 -0
- package/src/types.ts +134 -0
- package/src/utils/http.ts +302 -0
- package/src/wasm-adapter.ts +297 -0
- package/src/wasm-entry.ts +14 -0
- package/tsconfig.json +17 -0
- package/tsup.wasm.config.ts +26 -0
- package/vitest.config.ts +15 -0
@@ -0,0 +1,706 @@
/**
 * Advanced NLP Analysis for Keyword Research
 *
 * Uses data science techniques:
 * - TF-IDF for keyword extraction
 * - N-gram analysis for phrase detection
 * - BM25 scoring for relevance
 * - Keyword clustering using cosine similarity
 * - Embedding-based semantic grouping (OpenAI)
 * - Topic modeling (LDA-inspired)
 */

import OpenAI from 'openai';

// Stop words to filter out
const STOP_WORDS = new Set([
  'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
  'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been', 'be', 'have', 'has', 'had',
  'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must',
  'shall', 'can', 'need', 'dare', 'ought', 'used', 'it', 'its', 'this', 'that',
  'these', 'those', 'i', 'you', 'he', 'she', 'we', 'they', 'what', 'which', 'who',
  'whom', 'when', 'where', 'why', 'how', 'all', 'each', 'every', 'both', 'few',
  'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same',
  'so', 'than', 'too', 'very', 'just', 'also', 'now', 'here', 'there', 'then', 'once',
  'your', 'our', 'their', 'my', 'his', 'her', 'about', 'after', 'before', 'between',
  'into', 'through', 'during', 'above', 'below', 'up', 'down', 'out', 'off', 'over',
  'under', 'again', 'further', 'any', 'if', 'because', 'until', 'while', 'get', 'got',
  'getting', 'us', 'them', 'me', 'him', 'one', 'two', 'three', 'first', 'second',
]);

export interface TFIDFResult {
  term: string;
  tf: number;
  idf: number;
  tfidf: number;
  documentFrequency: number;
}

export interface NGram {
  phrase: string;
  frequency: number;
  words: number;
}

export interface KeywordCluster {
  id: number;
  name: string;
  keywords: string[];
  centroid?: number[];
  coherenceScore: number;
}

export interface TopicModel {
  topics: Array<{
    id: number;
    name: string;
    keywords: string[];
    weight: number;
  }>;
  documentTopicDistribution: Array<{
    documentId: number;
    topicWeights: number[];
  }>;
}

export interface NLPAnalysisResult {
  tfidfKeywords: TFIDFResult[];
  ngrams: {
    unigrams: NGram[];
    bigrams: NGram[];
    trigrams: NGram[];
  };
  clusters: KeywordCluster[];
  topics: TopicModel;
  entityPhrases: string[];
  semanticGroups: Array<{
    theme: string;
    keywords: string[];
  }>;
}

/**
 * Tokenize text into words
 */
export function tokenize(text: string): string[] {
  return text
    .toLowerCase()
    .replace(/[^\w\s'-]/g, ' ')
    .split(/\s+/)
    .filter((word) => word.length > 2 && !STOP_WORDS.has(word))
    .map((word) => word.replace(/^['"-]+|['"-]+$/g, ''));
}
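
// Usage sketch (illustrative only, not part of the published file): the tokenizer
// lowercases, drops punctuation, and removes short words and stop words.
const exampleTokens = tokenize('The Best SEO Audit Tools for 2024');
// exampleTokens → ['best', 'seo', 'audit', 'tools', '2024']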

/**
 * Calculate Term Frequency (TF)
 */
export function calculateTF(tokens: string[]): Map<string, number> {
  const tf = new Map<string, number>();
  const totalTerms = tokens.length;

  for (const token of tokens) {
    tf.set(token, (tf.get(token) || 0) + 1);
  }

  // Normalize by document length
  for (const [term, count] of tf) {
    tf.set(term, count / totalTerms);
  }

  return tf;
}

/**
 * Calculate Inverse Document Frequency (IDF)
 */
export function calculateIDF(documents: string[][]): Map<string, number> {
  const idf = new Map<string, number>();
  const N = documents.length;
  const documentFrequency = new Map<string, number>();

  // Count document frequency for each term
  for (const doc of documents) {
    const uniqueTerms = new Set(doc);
    for (const term of uniqueTerms) {
      documentFrequency.set(term, (documentFrequency.get(term) || 0) + 1);
    }
  }

  // Calculate IDF: log(N / df)
  for (const [term, df] of documentFrequency) {
    idf.set(term, Math.log(N / df) + 1); // +1 smoothing
  }

  return idf;
}

/**
 * Calculate TF-IDF for a corpus
 */
export function calculateTFIDF(documents: string[]): TFIDFResult[] {
  const tokenizedDocs = documents.map(tokenize);
  const idf = calculateIDF(tokenizedDocs);

  const results: TFIDFResult[] = [];
  const globalTF = new Map<string, number>();
  const totalTokens = tokenizedDocs.flat().length;

  // Aggregate TF across all documents
  for (const doc of tokenizedDocs) {
    for (const token of doc) {
      globalTF.set(token, (globalTF.get(token) || 0) + 1);
    }
  }

  // Calculate TF-IDF for each term
  for (const [term, count] of globalTF) {
    const tf = count / totalTokens;
    const idfValue = idf.get(term) || 1;
    const tfidf = tf * idfValue;

    // Count document frequency
    const df = tokenizedDocs.filter((doc) => doc.includes(term)).length;

    results.push({
      term,
      tf,
      idf: idfValue,
      tfidf,
      documentFrequency: df,
    });
  }

  // Sort by TF-IDF score
  return results.sort((a, b) => b.tfidf - a.tfidf);
}
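
// Usage sketch (illustrative only, not part of the published file): TF-IDF over a
// small corpus; terms concentrated in fewer documents score higher.
const tfidfDemo = calculateTFIDF([
  'technical seo audit checklist',
  'seo audit tools for agencies',
  'page speed and core web vitals',
]);
// tfidfDemo[0] is the highest-scoring term, with its tf, idf and documentFrequency.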

/**
 * Extract N-grams from text
 */
export function extractNgrams(
  text: string,
  n: number,
  minFrequency: number = 2
): NGram[] {
  const tokens = tokenize(text);
  const ngrams = new Map<string, number>();

  for (let i = 0; i <= tokens.length - n; i++) {
    const gram = tokens.slice(i, i + n).join(' ');
    ngrams.set(gram, (ngrams.get(gram) || 0) + 1);
  }

  return Array.from(ngrams.entries())
    .filter(([_, freq]) => freq >= minFrequency)
    .map(([phrase, frequency]) => ({
      phrase,
      frequency,
      words: n,
    }))
    .sort((a, b) => b.frequency - a.frequency);
}
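
// Usage sketch (illustrative only): bigrams that appear at least twice in the text.
const bigramDemo = extractNgrams(
  'seo audit tips. run an seo audit before every release.',
  2,
  2
);
// bigramDemo ≈ [{ phrase: 'seo audit', frequency: 2, words: 2 }]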

/**
 * BM25 scoring for keyword relevance
 * BM25 is a bag-of-words retrieval function that ranks documents by relevance
 */
export function calculateBM25(
  documents: string[],
  query: string,
  k1: number = 1.5,
  b: number = 0.75
): Array<{ docIndex: number; score: number }> {
  const tokenizedDocs = documents.map(tokenize);
  const queryTokens = tokenize(query);
  const avgDocLength = tokenizedDocs.reduce((sum, doc) => sum + doc.length, 0) / tokenizedDocs.length;
  const N = documents.length;

  // Calculate document frequency for query terms
  const df = new Map<string, number>();
  for (const term of queryTokens) {
    df.set(term, tokenizedDocs.filter((doc) => doc.includes(term)).length);
  }

  const scores: Array<{ docIndex: number; score: number }> = [];

  for (let i = 0; i < tokenizedDocs.length; i++) {
    const doc = tokenizedDocs[i];
    let score = 0;

    for (const term of queryTokens) {
      const termFreq = doc.filter((t) => t === term).length;
      const docFreq = df.get(term) || 0;

      if (docFreq === 0) continue;

      // IDF component
      const idf = Math.log((N - docFreq + 0.5) / (docFreq + 0.5) + 1);

      // TF component with length normalization
      const tfNorm =
        (termFreq * (k1 + 1)) /
        (termFreq + k1 * (1 - b + b * (doc.length / avgDocLength)));

      score += idf * tfNorm;
    }

    scores.push({ docIndex: i, score });
  }

  return scores.sort((a, b) => b.score - a.score);
}
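
// Usage sketch (illustrative only): rank three documents against a query with the
// default parameters k1 = 1.5 and b = 0.75.
const bm25Demo = calculateBM25(
  [
    'keyword research guide for beginners',
    'advanced keyword clustering with embeddings',
    'image compression and lazy loading',
  ],
  'keyword clustering'
);
// bm25Demo[0].docIndex is the most relevant document (here the second one, index 1).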

/**
 * Cosine similarity between two vectors
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  if (a.length !== b.length) return 0;

  let dotProduct = 0;
  let normA = 0;
  let normB = 0;

  for (let i = 0; i < a.length; i++) {
    dotProduct += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }

  if (normA === 0 || normB === 0) return 0;
  return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
}

/**
 * K-Means clustering for keywords
 */
export function kMeansClustering(
  vectors: number[][],
  k: number,
  maxIterations: number = 100
): number[] {
  if (vectors.length === 0 || k <= 0) return [];
  if (vectors.length <= k) return vectors.map((_, i) => i);

  const dim = vectors[0].length;

  // Initialize centroids randomly
  const centroids: number[][] = [];
  const usedIndices = new Set<number>();
  while (centroids.length < k) {
    const idx = Math.floor(Math.random() * vectors.length);
    if (!usedIndices.has(idx)) {
      usedIndices.add(idx);
      centroids.push([...vectors[idx]]);
    }
  }

  let assignments: number[] = new Array(vectors.length).fill(0);
  let changed = true;
  let iterations = 0;

  while (changed && iterations < maxIterations) {
    changed = false;
    iterations++;

    // Assign points to nearest centroid
    for (let i = 0; i < vectors.length; i++) {
      let minDist = Infinity;
      let bestCluster = 0;

      for (let j = 0; j < k; j++) {
        const dist = 1 - cosineSimilarity(vectors[i], centroids[j]);
        if (dist < minDist) {
          minDist = dist;
          bestCluster = j;
        }
      }

      if (assignments[i] !== bestCluster) {
        assignments[i] = bestCluster;
        changed = true;
      }
    }

    // Update centroids
    for (let j = 0; j < k; j++) {
      const clusterPoints = vectors.filter((_, i) => assignments[i] === j);
      if (clusterPoints.length === 0) continue;

      for (let d = 0; d < dim; d++) {
        centroids[j][d] = clusterPoints.reduce((sum, p) => sum + p[d], 0) / clusterPoints.length;
      }
    }
  }

  return assignments;
}

/**
 * Create term vectors for clustering (bag of words)
 */
export function createTermVectors(
  keywords: string[],
  vocabulary: string[]
): number[][] {
  const vocabIndex = new Map<string, number>();
  vocabulary.forEach((word, i) => vocabIndex.set(word, i));

  return keywords.map((keyword) => {
    const vector = new Array(vocabulary.length).fill(0);
    const tokens = tokenize(keyword);
    for (const token of tokens) {
      const idx = vocabIndex.get(token);
      if (idx !== undefined) {
        vector[idx] = 1;
      }
    }
    return vector;
  });
}
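
// Usage sketch (illustrative only): the bag-of-words path used when no API key is
// available — build term vectors over a shared vocabulary, then cluster with k-means.
const kwDemo = ['seo audit', 'site audit', 'keyword research', 'keyword ideas'];
const vocabDemo = [...new Set(kwDemo.flatMap(tokenize))];
const vectorDemo = createTermVectors(kwDemo, vocabDemo);
const assignmentDemo = kMeansClustering(vectorDemo, 2);
// assignmentDemo holds one cluster index per keyword; with random initialization the
// two 'audit' phrases usually land together and the two 'keyword' phrases together.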

/**
 * Get embeddings from OpenAI for semantic clustering
 */
export async function getEmbeddings(
  texts: string[],
  openai: OpenAI
): Promise<number[][]> {
  const embeddings: number[][] = [];

  // Process in batches of 100 (OpenAI limit)
  const batchSize = 100;
  for (let i = 0; i < texts.length; i += batchSize) {
    const batch = texts.slice(i, i + batchSize);

    try {
      const response = await openai.embeddings.create({
        model: 'text-embedding-3-small',
        input: batch,
      });

      for (const item of response.data) {
        embeddings.push(item.embedding);
      }
    } catch (error) {
      // Fallback: create simple bag-of-words vectors
      console.warn('Embedding API failed, using fallback BOW vectors');
      for (const text of batch) {
        const tokens = tokenize(text);
        const vector = new Array(1536).fill(0); // Match embedding dimension
        for (const token of tokens) {
          const hash = simpleHash(token) % 1536;
          vector[hash] = 1;
        }
        embeddings.push(vector);
      }
    }
  }

  return embeddings;
}

/**
 * Simple hash function for fallback vectors
 */
function simpleHash(str: string): number {
  let hash = 0;
  for (let i = 0; i < str.length; i++) {
    const char = str.charCodeAt(i);
    hash = (hash << 5) - hash + char;
    hash = hash & hash;
  }
  return Math.abs(hash);
}
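
// Usage sketch (illustrative only; assumes an OPENAI_API_KEY environment variable):
// fetch embeddings for a few keywords; on API failure the function above falls back
// to hashed bag-of-words vectors of the same dimension.
async function embeddingsDemo(): Promise<void> {
  const client = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
  const vecs = await getEmbeddings(['seo audit', 'keyword research'], client);
  console.log(vecs.length, vecs[0].length); // 2 vectors, 1536 dimensions each
}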

/**
 * Cluster keywords using embeddings
 */
export async function clusterKeywordsByEmbedding(
  keywords: string[],
  openai: OpenAI,
  numClusters?: number
): Promise<KeywordCluster[]> {
  if (keywords.length === 0) return [];

  // Get embeddings
  const embeddings = await getEmbeddings(keywords, openai);

  // Determine optimal number of clusters (elbow method approximation)
  const k = numClusters || Math.min(Math.max(3, Math.floor(keywords.length / 5)), 10);

  // Run K-means
  const assignments = kMeansClustering(embeddings, k);

  // Group keywords by cluster
  const clusters: KeywordCluster[] = [];
  for (let i = 0; i < k; i++) {
    const clusterKeywords = keywords.filter((_, j) => assignments[j] === i);
    if (clusterKeywords.length === 0) continue;

    // Calculate centroid
    const clusterEmbeddings = embeddings.filter((_, j) => assignments[j] === i);
    const centroid = clusterEmbeddings[0].map((_, d) =>
      clusterEmbeddings.reduce((sum, e) => sum + e[d], 0) / clusterEmbeddings.length
    );

    // Calculate coherence (average pairwise similarity)
    let coherence = 1;
    if (clusterEmbeddings.length > 1) {
      let totalSim = 0;
      let pairs = 0;
      for (let j = 0; j < clusterEmbeddings.length; j++) {
        for (let l = j + 1; l < clusterEmbeddings.length; l++) {
          totalSim += cosineSimilarity(clusterEmbeddings[j], clusterEmbeddings[l]);
          pairs++;
        }
      }
      coherence = pairs > 0 ? totalSim / pairs : 1;
    }

    // Generate cluster name from most common terms
    const allTokens = clusterKeywords.flatMap(tokenize);
    const tokenFreq = new Map<string, number>();
    for (const token of allTokens) {
      tokenFreq.set(token, (tokenFreq.get(token) || 0) + 1);
    }
    const topTokens = Array.from(tokenFreq.entries())
      .sort((a, b) => b[1] - a[1])
      .slice(0, 3)
      .map(([t]) => t);

    clusters.push({
      id: i,
      name: topTokens.join(' + '),
      keywords: clusterKeywords,
      centroid,
      coherenceScore: coherence,
    });
  }

  return clusters.sort((a, b) => b.coherenceScore - a.coherenceScore);
}
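
// Usage sketch (illustrative only): semantic clustering of a keyword list; when
// numClusters is omitted the count defaults to roughly one cluster per five
// keywords, bounded between 3 and 10.
async function clusterDemo(keywords: string[], apiKey: string): Promise<void> {
  const client = new OpenAI({ apiKey });
  const found = await clusterKeywordsByEmbedding(keywords, client);
  for (const c of found) {
    console.log(`${c.name} (coherence ${c.coherenceScore.toFixed(2)}):`, c.keywords);
  }
}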

/**
 * Extract entity-like phrases (capitalized sequences, proper nouns)
 */
export function extractEntityPhrases(text: string): string[] {
  const entities = new Set<string>();

  // Match capitalized word sequences (potential proper nouns/entities)
  const capitalizedPattern = /\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b/g;
  let match;
  while ((match = capitalizedPattern.exec(text)) !== null) {
    const phrase = match[1];
    if (phrase.length > 2 && !STOP_WORDS.has(phrase.toLowerCase())) {
      entities.add(phrase);
    }
  }

  // Match quoted phrases
  const quotedPattern = /"([^"]+)"/g;
  while ((match = quotedPattern.exec(text)) !== null) {
    const phrase = match[1].trim();
    if (phrase.length > 2 && phrase.length < 50) {
      entities.add(phrase);
    }
  }

  return Array.from(entities);
}
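
// Usage sketch (illustrative only): capitalized sequences and quoted phrases are
// treated as candidate entities.
const entityDemo = extractEntityPhrases(
  'Google Search Console reports "crawl budget" issues for large sites.'
);
// entityDemo ≈ ['Google Search Console', 'crawl budget']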

/**
 * Simplified LDA-inspired topic modeling
 * Groups keywords by co-occurrence patterns
 */
export function extractTopics(
  documents: string[],
  numTopics: number = 5
): TopicModel {
  const tokenizedDocs = documents.map(tokenize);

  // Build co-occurrence matrix
  const cooccurrence = new Map<string, Map<string, number>>();
  const termFreq = new Map<string, number>();

  for (const doc of tokenizedDocs) {
    const uniqueTerms = [...new Set(doc)];

    // Count term frequency
    for (const term of doc) {
      termFreq.set(term, (termFreq.get(term) || 0) + 1);
    }

    // Count co-occurrences
    for (let i = 0; i < uniqueTerms.length; i++) {
      for (let j = i + 1; j < uniqueTerms.length; j++) {
        const t1 = uniqueTerms[i];
        const t2 = uniqueTerms[j];

        if (!cooccurrence.has(t1)) cooccurrence.set(t1, new Map());
        if (!cooccurrence.has(t2)) cooccurrence.set(t2, new Map());

        cooccurrence.get(t1)!.set(t2, (cooccurrence.get(t1)!.get(t2) || 0) + 1);
        cooccurrence.get(t2)!.set(t1, (cooccurrence.get(t2)!.get(t1) || 0) + 1);
      }
    }
  }

  // Get top terms by frequency
  const topTerms = Array.from(termFreq.entries())
    .sort((a, b) => b[1] - a[1])
    .slice(0, 100)
    .map(([term]) => term);

  // Create simple topics based on highest co-occurrence clusters
  const usedTerms = new Set<string>();
  const topics: Array<{
    id: number;
    name: string;
    keywords: string[];
    weight: number;
  }> = [];

  for (let topicId = 0; topicId < numTopics && usedTerms.size < topTerms.length; topicId++) {
    // Find highest-frequency unused term as seed
    const seedTerm = topTerms.find((t) => !usedTerms.has(t));
    if (!seedTerm) break;

    usedTerms.add(seedTerm);
    const topicKeywords = [seedTerm];

    // Add co-occurring terms
    const cooccurMap = cooccurrence.get(seedTerm);
    if (cooccurMap) {
      const cooccurTerms = Array.from(cooccurMap.entries())
        .filter(([term]) => !usedTerms.has(term))
        .sort((a, b) => b[1] - a[1])
        .slice(0, 7);

      for (const [term] of cooccurTerms) {
        topicKeywords.push(term);
        usedTerms.add(term);
      }
    }

    topics.push({
      id: topicId,
      name: topicKeywords.slice(0, 3).join(', '),
      keywords: topicKeywords,
      weight: termFreq.get(seedTerm) || 0,
    });
  }

  // Calculate document-topic distribution
  const documentTopicDistribution = tokenizedDocs.map((doc, docId) => {
    const docTerms = new Set(doc);
    const topicWeights = topics.map((topic) => {
      const overlap = topic.keywords.filter((kw) => docTerms.has(kw)).length;
      return overlap / topic.keywords.length;
    });

    return {
      documentId: docId,
      topicWeights,
    };
  });

  return {
    topics: topics.sort((a, b) => b.weight - a.weight),
    documentTopicDistribution,
  };
}
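
// Usage sketch (illustrative only): co-occurrence topics over page-level documents;
// each topic is seeded by a frequent term plus up to seven co-occurring terms.
const topicDemo = extractTopics(
  [
    'seo audit checklist and seo audit tools',
    'keyword research tools and keyword clustering',
    'site speed, caching and image optimization',
  ],
  3
);
// topicDemo.topics lists the topics ordered by weight; topicDemo.documentTopicDistribution
// gives each document's overlap with every topic.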

/**
 * Run full NLP analysis on content
 */
export async function runNLPAnalysis(
  content: string | string[],
  options: {
    openaiApiKey?: string;
    numClusters?: number;
    numTopics?: number;
  } = {}
): Promise<NLPAnalysisResult> {
  const documents = Array.isArray(content) ? content : [content];
  const combinedText = documents.join(' ');

  // TF-IDF analysis
  const tfidfKeywords = calculateTFIDF(documents).slice(0, 100);

  // N-gram extraction
  const unigrams = extractNgrams(combinedText, 1, 3);
  const bigrams = extractNgrams(combinedText, 2, 2);
  const trigrams = extractNgrams(combinedText, 3, 2);

  // Topic modeling
  const topics = extractTopics(documents, options.numTopics || 5);

  // Entity extraction
  const entityPhrases = extractEntityPhrases(combinedText);

  // Keyword clustering (with embeddings if API key provided)
  let clusters: KeywordCluster[] = [];
  const keywordsToCluster = tfidfKeywords.slice(0, 50).map((k) => k.term);

  if (options.openaiApiKey && keywordsToCluster.length > 0) {
    try {
      const openai = new OpenAI({ apiKey: options.openaiApiKey });
      clusters = await clusterKeywordsByEmbedding(
        keywordsToCluster,
        openai,
        options.numClusters
      );
    } catch (error) {
      console.warn('Embedding clustering failed, using fallback');
    }
  }

  // Fallback clustering using term vectors
  if (clusters.length === 0 && keywordsToCluster.length > 0) {
    const vocabulary = [...new Set(keywordsToCluster.flatMap(tokenize))];
    const vectors = createTermVectors(keywordsToCluster, vocabulary);
    const k = Math.min(5, Math.max(2, Math.floor(keywordsToCluster.length / 5)));
    const assignments = kMeansClustering(vectors, k);

    for (let i = 0; i < k; i++) {
      const clusterKws = keywordsToCluster.filter((_, j) => assignments[j] === i);
      if (clusterKws.length === 0) continue;

      const allTokens = clusterKws.flatMap(tokenize);
      const tokenFreq = new Map<string, number>();
      for (const token of allTokens) {
        tokenFreq.set(token, (tokenFreq.get(token) || 0) + 1);
      }
      const topTokens = Array.from(tokenFreq.entries())
        .sort((a, b) => b[1] - a[1])
        .slice(0, 2)
        .map(([t]) => t);

      clusters.push({
        id: i,
        name: topTokens.join(' + '),
        keywords: clusterKws,
        coherenceScore: 0.5,
      });
    }
  }

  // Semantic groups from topics
  const semanticGroups = topics.topics.map((topic) => ({
    theme: topic.name,
    keywords: topic.keywords,
  }));

  return {
    tfidfKeywords,
    ngrams: {
      unigrams,
      bigrams,
      trigrams,
    },
    clusters,
    topics,
    entityPhrases,
    semanticGroups,
  };
}
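
// End-to-end usage sketch (illustrative only; assumes an OPENAI_API_KEY environment
// variable): the exported pipeline ties the pieces together; the OpenAI key is
// optional and only enables embedding-based clustering.
async function analysisDemo(pages: string[]): Promise<void> {
  const result = await runNLPAnalysis(pages, {
    openaiApiKey: process.env.OPENAI_API_KEY,
    numTopics: 5,
  });
  console.log(result.tfidfKeywords.slice(0, 10).map((k) => k.term));
  console.log(result.ngrams.bigrams.slice(0, 5).map((g) => g.phrase));
  console.log(result.clusters.map((c) => c.name));
}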