@rankcli/agent-runtime 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178)
  1. package/README.md +242 -0
  2. package/dist/analyzer-2CSWIQGD.mjs +6 -0
  3. package/dist/chunk-YNZYHEYM.mjs +774 -0
  4. package/dist/index.d.mts +4012 -0
  5. package/dist/index.d.ts +4012 -0
  6. package/dist/index.js +29672 -0
  7. package/dist/index.mjs +28602 -0
  8. package/package.json +53 -0
  9. package/scripts/build-deno.ts +134 -0
  10. package/src/audit/ai/analyzer.ts +347 -0
  11. package/src/audit/ai/index.ts +29 -0
  12. package/src/audit/ai/prompts/content-analysis.ts +271 -0
  13. package/src/audit/ai/types.ts +179 -0
  14. package/src/audit/checks/additional-checks.ts +439 -0
  15. package/src/audit/checks/ai-citation-worthiness.ts +399 -0
  16. package/src/audit/checks/ai-content-structure.ts +325 -0
  17. package/src/audit/checks/ai-readiness.ts +339 -0
  18. package/src/audit/checks/anchor-text.ts +179 -0
  19. package/src/audit/checks/answer-conciseness.ts +322 -0
  20. package/src/audit/checks/asset-minification.ts +270 -0
  21. package/src/audit/checks/bing-optimization.ts +206 -0
  22. package/src/audit/checks/brand-mention-optimization.ts +349 -0
  23. package/src/audit/checks/caching-headers.ts +305 -0
  24. package/src/audit/checks/canonical-advanced.ts +150 -0
  25. package/src/audit/checks/canonical-domain.ts +196 -0
  26. package/src/audit/checks/citation-quality.ts +358 -0
  27. package/src/audit/checks/client-rendering.ts +542 -0
  28. package/src/audit/checks/color-contrast.ts +342 -0
  29. package/src/audit/checks/content-freshness.ts +170 -0
  30. package/src/audit/checks/content-science.ts +589 -0
  31. package/src/audit/checks/conversion-elements.ts +526 -0
  32. package/src/audit/checks/crawlability.ts +220 -0
  33. package/src/audit/checks/directory-listing.ts +172 -0
  34. package/src/audit/checks/dom-analysis.ts +191 -0
  35. package/src/audit/checks/dom-size.ts +246 -0
  36. package/src/audit/checks/duplicate-content.ts +194 -0
  37. package/src/audit/checks/eeat-signals.ts +990 -0
  38. package/src/audit/checks/entity-seo.ts +396 -0
  39. package/src/audit/checks/featured-snippet.ts +473 -0
  40. package/src/audit/checks/freshness-signals.ts +443 -0
  41. package/src/audit/checks/funnel-intent.ts +463 -0
  42. package/src/audit/checks/hreflang.ts +174 -0
  43. package/src/audit/checks/html-compliance.ts +302 -0
  44. package/src/audit/checks/image-dimensions.ts +167 -0
  45. package/src/audit/checks/images.ts +160 -0
  46. package/src/audit/checks/indexnow.ts +275 -0
  47. package/src/audit/checks/interactive-tools.ts +475 -0
  48. package/src/audit/checks/internal-link-graph.ts +436 -0
  49. package/src/audit/checks/keyword-analysis.ts +239 -0
  50. package/src/audit/checks/keyword-cannibalization.ts +385 -0
  51. package/src/audit/checks/keyword-placement.ts +471 -0
  52. package/src/audit/checks/links.ts +203 -0
  53. package/src/audit/checks/llms-txt.ts +224 -0
  54. package/src/audit/checks/local-seo.ts +296 -0
  55. package/src/audit/checks/mobile.ts +167 -0
  56. package/src/audit/checks/modern-images.ts +226 -0
  57. package/src/audit/checks/navboost-signals.ts +395 -0
  58. package/src/audit/checks/on-page.ts +209 -0
  59. package/src/audit/checks/page-resources.ts +285 -0
  60. package/src/audit/checks/pagination.ts +180 -0
  61. package/src/audit/checks/performance.ts +153 -0
  62. package/src/audit/checks/platform-presence.ts +580 -0
  63. package/src/audit/checks/redirect-analysis.ts +153 -0
  64. package/src/audit/checks/redirect-chain.ts +389 -0
  65. package/src/audit/checks/resource-hints.ts +420 -0
  66. package/src/audit/checks/responsive-css.ts +247 -0
  67. package/src/audit/checks/responsive-images.ts +396 -0
  68. package/src/audit/checks/review-ecosystem.ts +415 -0
  69. package/src/audit/checks/robots-validation.ts +373 -0
  70. package/src/audit/checks/security-headers.ts +172 -0
  71. package/src/audit/checks/security.ts +144 -0
  72. package/src/audit/checks/serp-preview.ts +251 -0
  73. package/src/audit/checks/site-maturity.ts +444 -0
  74. package/src/audit/checks/social-meta.test.ts +275 -0
  75. package/src/audit/checks/social-meta.ts +134 -0
  76. package/src/audit/checks/soft-404.ts +151 -0
  77. package/src/audit/checks/structured-data.ts +238 -0
  78. package/src/audit/checks/tech-detection.ts +496 -0
  79. package/src/audit/checks/topical-clusters.ts +435 -0
  80. package/src/audit/checks/tracker-bloat.ts +462 -0
  81. package/src/audit/checks/tracking-verification.test.ts +371 -0
  82. package/src/audit/checks/tracking-verification.ts +636 -0
  83. package/src/audit/checks/url-safety.ts +682 -0
  84. package/src/audit/deno-entry.ts +66 -0
  85. package/src/audit/discovery/index.ts +15 -0
  86. package/src/audit/discovery/link-crawler.ts +232 -0
  87. package/src/audit/discovery/repo-routes.ts +347 -0
  88. package/src/audit/engine.ts +620 -0
  89. package/src/audit/fixes/index.ts +209 -0
  90. package/src/audit/fixes/social-meta-fixes.test.ts +329 -0
  91. package/src/audit/fixes/social-meta-fixes.ts +463 -0
  92. package/src/audit/index.ts +74 -0
  93. package/src/audit/runner.test.ts +299 -0
  94. package/src/audit/runner.ts +130 -0
  95. package/src/audit/types.ts +1953 -0
  96. package/src/content/featured-snippet.ts +367 -0
  97. package/src/content/generator.test.ts +534 -0
  98. package/src/content/generator.ts +501 -0
  99. package/src/content/headline.ts +317 -0
  100. package/src/content/index.ts +62 -0
  101. package/src/content/intent.ts +258 -0
  102. package/src/content/keyword-density.ts +349 -0
  103. package/src/content/readability.ts +262 -0
  104. package/src/executor.ts +336 -0
  105. package/src/fixer.ts +416 -0
  106. package/src/frameworks/detector.test.ts +248 -0
  107. package/src/frameworks/detector.ts +371 -0
  108. package/src/frameworks/index.ts +68 -0
  109. package/src/frameworks/recipes/angular.yaml +171 -0
  110. package/src/frameworks/recipes/astro.yaml +206 -0
  111. package/src/frameworks/recipes/django.yaml +180 -0
  112. package/src/frameworks/recipes/laravel.yaml +137 -0
  113. package/src/frameworks/recipes/nextjs.yaml +268 -0
  114. package/src/frameworks/recipes/nuxt.yaml +175 -0
  115. package/src/frameworks/recipes/rails.yaml +188 -0
  116. package/src/frameworks/recipes/react.yaml +202 -0
  117. package/src/frameworks/recipes/sveltekit.yaml +154 -0
  118. package/src/frameworks/recipes/vue.yaml +137 -0
  119. package/src/frameworks/recipes/wordpress.yaml +209 -0
  120. package/src/frameworks/suggestion-engine.ts +320 -0
  121. package/src/geo/geo-content.test.ts +305 -0
  122. package/src/geo/geo-content.ts +266 -0
  123. package/src/geo/geo-history.test.ts +473 -0
  124. package/src/geo/geo-history.ts +433 -0
  125. package/src/geo/geo-tracker.test.ts +359 -0
  126. package/src/geo/geo-tracker.ts +411 -0
  127. package/src/geo/index.ts +10 -0
  128. package/src/git/commit-helper.test.ts +261 -0
  129. package/src/git/commit-helper.ts +329 -0
  130. package/src/git/index.ts +12 -0
  131. package/src/git/pr-helper.test.ts +284 -0
  132. package/src/git/pr-helper.ts +307 -0
  133. package/src/index.ts +66 -0
  134. package/src/keywords/ai-keyword-engine.ts +1062 -0
  135. package/src/keywords/ai-summarizer.ts +387 -0
  136. package/src/keywords/ci-mode.ts +555 -0
  137. package/src/keywords/engine.ts +359 -0
  138. package/src/keywords/index.ts +151 -0
  139. package/src/keywords/llm-judge.ts +357 -0
  140. package/src/keywords/nlp-analysis.ts +706 -0
  141. package/src/keywords/prioritizer.ts +295 -0
  142. package/src/keywords/site-crawler.ts +342 -0
  143. package/src/keywords/sources/autocomplete.ts +139 -0
  144. package/src/keywords/sources/competitive-search.ts +450 -0
  145. package/src/keywords/sources/competitor-analysis.ts +374 -0
  146. package/src/keywords/sources/dataforseo.ts +206 -0
  147. package/src/keywords/sources/free-sources.ts +294 -0
  148. package/src/keywords/sources/gsc.ts +123 -0
  149. package/src/keywords/topic-grouping.ts +327 -0
  150. package/src/keywords/types.ts +144 -0
  151. package/src/keywords/wizard.ts +457 -0
  152. package/src/loader.ts +40 -0
  153. package/src/reports/index.ts +7 -0
  154. package/src/reports/report-generator.test.ts +293 -0
  155. package/src/reports/report-generator.ts +713 -0
  156. package/src/scheduler/alerts.test.ts +458 -0
  157. package/src/scheduler/alerts.ts +328 -0
  158. package/src/scheduler/index.ts +8 -0
  159. package/src/scheduler/scheduled-audit.test.ts +377 -0
  160. package/src/scheduler/scheduled-audit.ts +149 -0
  161. package/src/test/integration-test.ts +325 -0
  162. package/src/tools/analyzer.ts +373 -0
  163. package/src/tools/crawl.ts +293 -0
  164. package/src/tools/files.ts +301 -0
  165. package/src/tools/h1-fixer.ts +249 -0
  166. package/src/tools/index.ts +67 -0
  167. package/src/tracking/github-action.ts +326 -0
  168. package/src/tracking/google-analytics.ts +265 -0
  169. package/src/tracking/index.ts +45 -0
  170. package/src/tracking/report-generator.ts +386 -0
  171. package/src/tracking/search-console.ts +335 -0
  172. package/src/types.ts +134 -0
  173. package/src/utils/http.ts +302 -0
  174. package/src/wasm-adapter.ts +297 -0
  175. package/src/wasm-entry.ts +14 -0
  176. package/tsconfig.json +17 -0
  177. package/tsup.wasm.config.ts +26 -0
  178. package/vitest.config.ts +15 -0
package/src/keywords/nlp-analysis.ts (new file)
@@ -0,0 +1,706 @@
/**
 * Advanced NLP Analysis for Keyword Research
 *
 * Uses data science techniques:
 * - TF-IDF for keyword extraction
 * - N-gram analysis for phrase detection
 * - BM25 scoring for relevance
 * - Keyword clustering using cosine similarity
 * - Embedding-based semantic grouping (OpenAI)
 * - Topic modeling (LDA-inspired)
 */

import OpenAI from 'openai';

// Stop words to filter out
const STOP_WORDS = new Set([
  'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
  'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been', 'be', 'have', 'has', 'had',
  'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must',
  'shall', 'can', 'need', 'dare', 'ought', 'used', 'it', 'its', 'this', 'that',
  'these', 'those', 'i', 'you', 'he', 'she', 'we', 'they', 'what', 'which', 'who',
  'whom', 'when', 'where', 'why', 'how', 'all', 'each', 'every', 'both', 'few',
  'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same',
  'so', 'than', 'too', 'very', 'just', 'also', 'now', 'here', 'there', 'then', 'once',
  'your', 'our', 'their', 'my', 'his', 'her', 'about', 'after', 'before', 'between',
  'into', 'through', 'during', 'above', 'below', 'up', 'down', 'out', 'off', 'over',
  'under', 'again', 'further', 'any', 'if', 'because', 'until', 'while', 'get', 'got',
  'getting', 'us', 'them', 'me', 'him', 'one', 'two', 'three', 'first', 'second',
]);

export interface TFIDFResult {
  term: string;
  tf: number;
  idf: number;
  tfidf: number;
  documentFrequency: number;
}

export interface NGram {
  phrase: string;
  frequency: number;
  words: number;
}

export interface KeywordCluster {
  id: number;
  name: string;
  keywords: string[];
  centroid?: number[];
  coherenceScore: number;
}

export interface TopicModel {
  topics: Array<{
    id: number;
    name: string;
    keywords: string[];
    weight: number;
  }>;
  documentTopicDistribution: Array<{
    documentId: number;
    topicWeights: number[];
  }>;
}

export interface NLPAnalysisResult {
  tfidfKeywords: TFIDFResult[];
  ngrams: {
    unigrams: NGram[];
    bigrams: NGram[];
    trigrams: NGram[];
  };
  clusters: KeywordCluster[];
  topics: TopicModel;
  entityPhrases: string[];
  semanticGroups: Array<{
    theme: string;
    keywords: string[];
  }>;
}

/**
 * Tokenize text into lowercase words, dropping punctuation and stop words
 */
export function tokenize(text: string): string[] {
  return text
    .toLowerCase()
    .replace(/[^\w\s'-]/g, ' ')
    .split(/\s+/)
    // Strip wrapping quotes/hyphens before filtering, so tokens like "'its"
    // or "--a" cannot slip past the stop-word and length checks
    .map((word) => word.replace(/^['"-]+|['"-]+$/g, ''))
    .filter((word) => word.length > 2 && !STOP_WORDS.has(word));
}
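
/*
 * Usage sketch (editor's illustration, not part of the published file):
 *
 *   tokenize('The Best SEO Tools for 2024!');
 *   // => ['best', 'seo', 'tools', '2024']
 *   // ('the' and 'for' are stop words; punctuation becomes whitespace)
 */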

/**
 * Calculate Term Frequency (TF)
 */
export function calculateTF(tokens: string[]): Map<string, number> {
  const tf = new Map<string, number>();
  const totalTerms = tokens.length;

  for (const token of tokens) {
    tf.set(token, (tf.get(token) || 0) + 1);
  }

  // Normalize by document length
  for (const [term, count] of tf) {
    tf.set(term, count / totalTerms);
  }

  return tf;
}
112
+
113
+ /**
114
+ * Calculate Inverse Document Frequency (IDF)
115
+ */
116
+ export function calculateIDF(documents: string[][]): Map<string, number> {
117
+ const idf = new Map<string, number>();
118
+ const N = documents.length;
119
+ const documentFrequency = new Map<string, number>();
120
+
121
+ // Count document frequency for each term
122
+ for (const doc of documents) {
123
+ const uniqueTerms = new Set(doc);
124
+ for (const term of uniqueTerms) {
125
+ documentFrequency.set(term, (documentFrequency.get(term) || 0) + 1);
126
+ }
127
+ }
128
+
129
+ // Calculate IDF: log(N / df)
130
+ for (const [term, df] of documentFrequency) {
131
+ idf.set(term, Math.log(N / df) + 1); // +1 smoothing
132
+ }
133
+
134
+ return idf;
135
+ }

/**
 * Calculate TF-IDF for a corpus
 */
export function calculateTFIDF(documents: string[]): TFIDFResult[] {
  const tokenizedDocs = documents.map(tokenize);
  const idf = calculateIDF(tokenizedDocs);

  const results: TFIDFResult[] = [];
  const globalTF = new Map<string, number>();
  const totalTokens = tokenizedDocs.flat().length;

  // Aggregate TF across all documents
  for (const doc of tokenizedDocs) {
    for (const token of doc) {
      globalTF.set(token, (globalTF.get(token) || 0) + 1);
    }
  }

  // Calculate TF-IDF for each term
  for (const [term, count] of globalTF) {
    const tf = count / totalTokens;
    const idfValue = idf.get(term) || 1;
    const tfidf = tf * idfValue;

    // Count document frequency
    const df = tokenizedDocs.filter((doc) => doc.includes(term)).length;

    results.push({
      term,
      tf,
      idf: idfValue,
      tfidf,
      documentFrequency: df,
    });
  }

  // Sort by TF-IDF score
  return results.sort((a, b) => b.tfidf - a.tfidf);
}
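
/*
 * Usage sketch (editor's illustration, not part of the published file):
 *
 *   const corpus = [
 *     'seo audit checklist for ecommerce sites',
 *     'seo audit tools and site crawlers',
 *     'content marketing calendar ideas',
 *   ];
 *   const top = calculateTFIDF(corpus);
 *   // 'seo' and 'audit' (each appearing in two documents) rank above one-off
 *   // terms like 'crawlers', whose higher IDF does not offset their lower TF here
 */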

/**
 * Extract N-grams from text
 */
export function extractNgrams(
  text: string,
  n: number,
  minFrequency: number = 2
): NGram[] {
  const tokens = tokenize(text);
  const ngrams = new Map<string, number>();

  for (let i = 0; i <= tokens.length - n; i++) {
    const gram = tokens.slice(i, i + n).join(' ');
    ngrams.set(gram, (ngrams.get(gram) || 0) + 1);
  }

  return Array.from(ngrams.entries())
    .filter(([_, freq]) => freq >= minFrequency)
    .map(([phrase, frequency]) => ({
      phrase,
      frequency,
      words: n,
    }))
    .sort((a, b) => b.frequency - a.frequency);
}
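
/*
 * Usage sketch (editor's illustration, not part of the published file):
 *
 *   extractNgrams('technical seo audit guide. technical seo basics.', 2);
 *   // => [{ phrase: 'technical seo', frequency: 2, words: 2 }]
 *   // (all other bigrams occur once and fall below minFrequency = 2)
 */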

/**
 * BM25 scoring for keyword relevance
 * BM25 is a bag-of-words retrieval function that ranks documents by relevance
 * to a query: k1 controls term-frequency saturation, b controls how strongly
 * scores are normalized by document length
 */
export function calculateBM25(
  documents: string[],
  query: string,
  k1: number = 1.5,
  b: number = 0.75
): Array<{ docIndex: number; score: number }> {
  const tokenizedDocs = documents.map(tokenize);
  const queryTokens = tokenize(query);
  const avgDocLength =
    tokenizedDocs.reduce((sum, doc) => sum + doc.length, 0) / tokenizedDocs.length;
  const N = documents.length;

  // Calculate document frequency for query terms
  const df = new Map<string, number>();
  for (const term of queryTokens) {
    df.set(term, tokenizedDocs.filter((doc) => doc.includes(term)).length);
  }

  const scores: Array<{ docIndex: number; score: number }> = [];

  for (let i = 0; i < tokenizedDocs.length; i++) {
    const doc = tokenizedDocs[i];
    let score = 0;

    for (const term of queryTokens) {
      const termFreq = doc.filter((t) => t === term).length;
      const docFreq = df.get(term) || 0;

      if (docFreq === 0) continue;

      // IDF component
      const idf = Math.log((N - docFreq + 0.5) / (docFreq + 0.5) + 1);

      // TF component with length normalization
      const tfNorm =
        (termFreq * (k1 + 1)) /
        (termFreq + k1 * (1 - b + b * (doc.length / avgDocLength)));

      score += idf * tfNorm;
    }

    scores.push({ docIndex: i, score });
  }

  return scores.sort((a, b) => b.score - a.score);
}
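
/*
 * Usage sketch (editor's illustration, not part of the published file;
 * pageTexts stands in for whatever documents you have collected):
 *
 *   const ranked = calculateBM25(pageTexts, 'technical seo audit');
 *   // => [{ docIndex, score }, ...] sorted best match first. With the
 *   // defaults k1 = 1.5 and b = 0.75, extra repetitions of a query term add
 *   // diminishing score, and long documents are normalized toward avgDocLength
 */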

/**
 * Cosine similarity between two vectors
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  if (a.length !== b.length) return 0;

  let dotProduct = 0;
  let normA = 0;
  let normB = 0;

  for (let i = 0; i < a.length; i++) {
    dotProduct += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }

  if (normA === 0 || normB === 0) return 0;
  return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
}

/**
 * K-Means clustering for keywords (cosine distance)
 */
export function kMeansClustering(
  vectors: number[][],
  k: number,
  maxIterations: number = 100
): number[] {
  if (vectors.length === 0 || k <= 0) return [];
  if (vectors.length <= k) return vectors.map((_, i) => i);

  const dim = vectors[0].length;

  // Initialize centroids from k distinct random points
  const centroids: number[][] = [];
  const usedIndices = new Set<number>();
  while (centroids.length < k) {
    const idx = Math.floor(Math.random() * vectors.length);
    if (!usedIndices.has(idx)) {
      usedIndices.add(idx);
      centroids.push([...vectors[idx]]);
    }
  }

  const assignments: number[] = new Array(vectors.length).fill(0);
  let changed = true;
  let iterations = 0;

  while (changed && iterations < maxIterations) {
    changed = false;
    iterations++;

    // Assign points to nearest centroid (distance = 1 - cosine similarity)
    for (let i = 0; i < vectors.length; i++) {
      let minDist = Infinity;
      let bestCluster = 0;

      for (let j = 0; j < k; j++) {
        const dist = 1 - cosineSimilarity(vectors[i], centroids[j]);
        if (dist < minDist) {
          minDist = dist;
          bestCluster = j;
        }
      }

      if (assignments[i] !== bestCluster) {
        assignments[i] = bestCluster;
        changed = true;
      }
    }

    // Update centroids as the mean of their assigned points
    for (let j = 0; j < k; j++) {
      const clusterPoints = vectors.filter((_, i) => assignments[i] === j);
      if (clusterPoints.length === 0) continue;

      for (let d = 0; d < dim; d++) {
        centroids[j][d] =
          clusterPoints.reduce((sum, p) => sum + p[d], 0) / clusterPoints.length;
      }
    }
  }

  return assignments;
}

/**
 * Create term vectors for clustering (bag of words)
 */
export function createTermVectors(
  keywords: string[],
  vocabulary: string[]
): number[][] {
  const vocabIndex = new Map<string, number>();
  vocabulary.forEach((word, i) => vocabIndex.set(word, i));

  return keywords.map((keyword) => {
    const vector = new Array(vocabulary.length).fill(0);
    const tokens = tokenize(keyword);
    for (const token of tokens) {
      const idx = vocabIndex.get(token);
      if (idx !== undefined) {
        vector[idx] = 1;
      }
    }
    return vector;
  });
}
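
/*
 * Usage sketch (editor's illustration, not part of the published file):
 * the offline clustering path chains these two helpers.
 *
 *   const kws = ['seo audit', 'site audit tool', 'page speed', 'core web vitals'];
 *   const vocabulary = [...new Set(kws.flatMap(tokenize))];
 *   const vectors = createTermVectors(kws, vocabulary);
 *   const assignments = kMeansClustering(vectors, 2);
 *   // assignments[i] is the cluster index for kws[i]; the two 'audit'
 *   // keywords share a vocabulary term, so they tend to cluster together
 *   // (initial centroids are random, so results can vary between runs)
 */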

/**
 * Get embeddings from OpenAI for semantic clustering
 */
export async function getEmbeddings(
  texts: string[],
  openai: OpenAI
): Promise<number[][]> {
  const embeddings: number[][] = [];

  // Process in batches of 100 inputs per request to stay well under API limits
  const batchSize = 100;
  for (let i = 0; i < texts.length; i += batchSize) {
    const batch = texts.slice(i, i + batchSize);

    try {
      const response = await openai.embeddings.create({
        model: 'text-embedding-3-small',
        input: batch,
      });

      for (const item of response.data) {
        embeddings.push(item.embedding);
      }
    } catch (error) {
      // Fallback: create simple hashed bag-of-words vectors
      console.warn('Embedding API failed, using fallback BOW vectors', error);
      for (const text of batch) {
        const tokens = tokenize(text);
        const vector = new Array(1536).fill(0); // Match text-embedding-3-small's 1536 dimensions
        for (const token of tokens) {
          const hash = simpleHash(token) % 1536;
          vector[hash] = 1;
        }
        embeddings.push(vector);
      }
    }
  }

  return embeddings;
}

/**
 * Simple hash function for fallback vectors
 */
function simpleHash(str: string): number {
  let hash = 0;
  for (let i = 0; i < str.length; i++) {
    const char = str.charCodeAt(i);
    hash = (hash << 5) - hash + char; // hash * 31 + char
    hash = hash & hash; // coerce to a 32-bit integer
  }
  return Math.abs(hash);
}

/**
 * Cluster keywords using embeddings
 */
export async function clusterKeywordsByEmbedding(
  keywords: string[],
  openai: OpenAI,
  numClusters?: number
): Promise<KeywordCluster[]> {
  if (keywords.length === 0) return [];

  // Get embeddings
  const embeddings = await getEmbeddings(keywords, openai);

  // Heuristic cluster count: roughly one cluster per five keywords, clamped to [3, 10]
  const k = numClusters || Math.min(Math.max(3, Math.floor(keywords.length / 5)), 10);

  // Run K-means
  const assignments = kMeansClustering(embeddings, k);

  // Group keywords by cluster
  const clusters: KeywordCluster[] = [];
  for (let i = 0; i < k; i++) {
    const clusterKeywords = keywords.filter((_, j) => assignments[j] === i);
    if (clusterKeywords.length === 0) continue;

    // Calculate centroid
    const clusterEmbeddings = embeddings.filter((_, j) => assignments[j] === i);
    const centroid = clusterEmbeddings[0].map((_, d) =>
      clusterEmbeddings.reduce((sum, e) => sum + e[d], 0) / clusterEmbeddings.length
    );

    // Calculate coherence (average pairwise similarity)
    let coherence = 1;
    if (clusterEmbeddings.length > 1) {
      let totalSim = 0;
      let pairs = 0;
      for (let j = 0; j < clusterEmbeddings.length; j++) {
        for (let l = j + 1; l < clusterEmbeddings.length; l++) {
          totalSim += cosineSimilarity(clusterEmbeddings[j], clusterEmbeddings[l]);
          pairs++;
        }
      }
      coherence = pairs > 0 ? totalSim / pairs : 1;
    }

    // Generate cluster name from most common terms
    const allTokens = clusterKeywords.flatMap(tokenize);
    const tokenFreq = new Map<string, number>();
    for (const token of allTokens) {
      tokenFreq.set(token, (tokenFreq.get(token) || 0) + 1);
    }
    const topTokens = Array.from(tokenFreq.entries())
      .sort((a, b) => b[1] - a[1])
      .slice(0, 3)
      .map(([t]) => t);

    clusters.push({
      id: i,
      name: topTokens.join(' + '),
      keywords: clusterKeywords,
      centroid,
      coherenceScore: coherence,
    });
  }

  return clusters.sort((a, b) => b.coherenceScore - a.coherenceScore);
}
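
/*
 * Usage sketch (editor's illustration, not part of the published file;
 * assumes OPENAI_API_KEY is set in the environment):
 *
 *   const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
 *   const clusters = await clusterKeywordsByEmbedding(
 *     ['seo audit', 'site health check', 'keyword research', 'search terms'],
 *     openai
 *   );
 *   // => clusters named from their most frequent tokens, sorted by
 *   //    coherenceScore (average pairwise cosine similarity within a cluster)
 */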

/**
 * Extract entity-like phrases (capitalized sequences, proper nouns)
 */
export function extractEntityPhrases(text: string): string[] {
  const entities = new Set<string>();

  // Match capitalized word sequences (potential proper nouns/entities)
  const capitalizedPattern = /\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b/g;
  let match: RegExpExecArray | null;
  while ((match = capitalizedPattern.exec(text)) !== null) {
    const phrase = match[1];
    if (phrase.length > 2 && !STOP_WORDS.has(phrase.toLowerCase())) {
      entities.add(phrase);
    }
  }

  // Match quoted phrases
  const quotedPattern = /"([^"]+)"/g;
  while ((match = quotedPattern.exec(text)) !== null) {
    const phrase = match[1].trim();
    if (phrase.length > 2 && phrase.length < 50) {
      entities.add(phrase);
    }
  }

  return Array.from(entities);
}

/**
 * Simplified LDA-inspired topic modeling
 * Groups keywords by co-occurrence patterns
 */
export function extractTopics(
  documents: string[],
  numTopics: number = 5
): TopicModel {
  const tokenizedDocs = documents.map(tokenize);

  // Build co-occurrence matrix
  const cooccurrence = new Map<string, Map<string, number>>();
  const termFreq = new Map<string, number>();

  for (const doc of tokenizedDocs) {
    const uniqueTerms = [...new Set(doc)];

    // Count term frequency
    for (const term of doc) {
      termFreq.set(term, (termFreq.get(term) || 0) + 1);
    }

    // Count co-occurrences
    for (let i = 0; i < uniqueTerms.length; i++) {
      for (let j = i + 1; j < uniqueTerms.length; j++) {
        const t1 = uniqueTerms[i];
        const t2 = uniqueTerms[j];

        if (!cooccurrence.has(t1)) cooccurrence.set(t1, new Map());
        if (!cooccurrence.has(t2)) cooccurrence.set(t2, new Map());

        cooccurrence.get(t1)!.set(t2, (cooccurrence.get(t1)!.get(t2) || 0) + 1);
        cooccurrence.get(t2)!.set(t1, (cooccurrence.get(t2)!.get(t1) || 0) + 1);
      }
    }
  }

  // Get top terms by frequency
  const topTerms = Array.from(termFreq.entries())
    .sort((a, b) => b[1] - a[1])
    .slice(0, 100)
    .map(([term]) => term);

  // Create simple topics based on highest co-occurrence clusters
  const usedTerms = new Set<string>();
  const topics: Array<{
    id: number;
    name: string;
    keywords: string[];
    weight: number;
  }> = [];

  for (let topicId = 0; topicId < numTopics && usedTerms.size < topTerms.length; topicId++) {
    // Find highest-frequency unused term as seed
    const seedTerm = topTerms.find((t) => !usedTerms.has(t));
    if (!seedTerm) break;

    usedTerms.add(seedTerm);
    const topicKeywords = [seedTerm];

    // Add co-occurring terms
    const cooccurMap = cooccurrence.get(seedTerm);
    if (cooccurMap) {
      const cooccurTerms = Array.from(cooccurMap.entries())
        .filter(([term]) => !usedTerms.has(term))
        .sort((a, b) => b[1] - a[1])
        .slice(0, 7);

      for (const [term] of cooccurTerms) {
        topicKeywords.push(term);
        usedTerms.add(term);
      }
    }

    topics.push({
      id: topicId,
      name: topicKeywords.slice(0, 3).join(', '),
      keywords: topicKeywords,
      weight: termFreq.get(seedTerm) || 0,
    });
  }

  // Calculate document-topic distribution
  const documentTopicDistribution = tokenizedDocs.map((doc, docId) => {
    const docTerms = new Set(doc);
    const topicWeights = topics.map((topic) => {
      const overlap = topic.keywords.filter((kw) => docTerms.has(kw)).length;
      return overlap / topic.keywords.length;
    });

    return {
      documentId: docId,
      topicWeights,
    };
  });

  return {
    topics: topics.sort((a, b) => b.weight - a.weight),
    documentTopicDistribution,
  };
}
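
/*
 * Usage sketch (editor's illustration, not part of the published file;
 * pageTexts stands in for your crawled page content):
 *
 *   const model = extractTopics(pageTexts, 3);
 *   model.topics.forEach((t) => console.log(t.name, t.keywords, t.weight));
 *   // Each topic is seeded by the most frequent unused term, then filled
 *   // with its strongest co-occurring terms.
 *   // documentTopicDistribution[d].topicWeights[t] is the fraction of
 *   // topic t's keywords that appear in document d.
 */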

/**
 * Run full NLP analysis on content
 */
export async function runNLPAnalysis(
  content: string | string[],
  options: {
    openaiApiKey?: string;
    numClusters?: number;
    numTopics?: number;
  } = {}
): Promise<NLPAnalysisResult> {
  const documents = Array.isArray(content) ? content : [content];
  const combinedText = documents.join(' ');

  // TF-IDF analysis
  const tfidfKeywords = calculateTFIDF(documents).slice(0, 100);

  // N-gram extraction
  const unigrams = extractNgrams(combinedText, 1, 3);
  const bigrams = extractNgrams(combinedText, 2, 2);
  const trigrams = extractNgrams(combinedText, 3, 2);

  // Topic modeling
  const topics = extractTopics(documents, options.numTopics || 5);

  // Entity extraction
  const entityPhrases = extractEntityPhrases(combinedText);

  // Keyword clustering (with embeddings if API key provided)
  let clusters: KeywordCluster[] = [];
  const keywordsToCluster = tfidfKeywords.slice(0, 50).map((k) => k.term);

  if (options.openaiApiKey && keywordsToCluster.length > 0) {
    try {
      const openai = new OpenAI({ apiKey: options.openaiApiKey });
      clusters = await clusterKeywordsByEmbedding(
        keywordsToCluster,
        openai,
        options.numClusters
      );
    } catch (error) {
      console.warn('Embedding clustering failed, using fallback', error);
    }
  }

  // Fallback clustering using term vectors
  if (clusters.length === 0 && keywordsToCluster.length > 0) {
    const vocabulary = [...new Set(keywordsToCluster.flatMap(tokenize))];
    const vectors = createTermVectors(keywordsToCluster, vocabulary);
    const k = Math.min(5, Math.max(2, Math.floor(keywordsToCluster.length / 5)));
    const assignments = kMeansClustering(vectors, k);

    for (let i = 0; i < k; i++) {
      const clusterKws = keywordsToCluster.filter((_, j) => assignments[j] === i);
      if (clusterKws.length === 0) continue;

      const allTokens = clusterKws.flatMap(tokenize);
      const tokenFreq = new Map<string, number>();
      for (const token of allTokens) {
        tokenFreq.set(token, (tokenFreq.get(token) || 0) + 1);
      }
      const topTokens = Array.from(tokenFreq.entries())
        .sort((a, b) => b[1] - a[1])
        .slice(0, 2)
        .map(([t]) => t);

      clusters.push({
        id: i,
        name: topTokens.join(' + '),
        keywords: clusterKws,
        coherenceScore: 0.5,
      });
    }
  }

  // Semantic groups from topics
  const semanticGroups = topics.topics.map((topic) => ({
    theme: topic.name,
    keywords: topic.keywords,
  }));

  return {
    tfidfKeywords,
    ngrams: {
      unigrams,
      bigrams,
      trigrams,
    },
    clusters,
    topics,
    entityPhrases,
    semanticGroups,
  };
}
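
/*
 * End-to-end usage sketch (editor's illustration, not part of the published
 * file; pageTexts stands in for your crawled page content):
 *
 *   const result = await runNLPAnalysis(pageTexts, {
 *     openaiApiKey: process.env.OPENAI_API_KEY, // optional; enables embeddings
 *     numTopics: 5,
 *   });
 *   console.log(result.tfidfKeywords.slice(0, 10).map((k) => k.term));
 *   console.log(result.clusters.map((c) => c.name));
 *   // Without an API key, clustering falls back to the bag-of-words k-means path
 */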