@rankcli/agent-runtime 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. package/README.md +242 -0
  2. package/dist/analyzer-2CSWIQGD.mjs +6 -0
  3. package/dist/chunk-YNZYHEYM.mjs +774 -0
  4. package/dist/index.d.mts +4012 -0
  5. package/dist/index.d.ts +4012 -0
  6. package/dist/index.js +29672 -0
  7. package/dist/index.mjs +28602 -0
  8. package/package.json +53 -0
  9. package/scripts/build-deno.ts +134 -0
  10. package/src/audit/ai/analyzer.ts +347 -0
  11. package/src/audit/ai/index.ts +29 -0
  12. package/src/audit/ai/prompts/content-analysis.ts +271 -0
  13. package/src/audit/ai/types.ts +179 -0
  14. package/src/audit/checks/additional-checks.ts +439 -0
  15. package/src/audit/checks/ai-citation-worthiness.ts +399 -0
  16. package/src/audit/checks/ai-content-structure.ts +325 -0
  17. package/src/audit/checks/ai-readiness.ts +339 -0
  18. package/src/audit/checks/anchor-text.ts +179 -0
  19. package/src/audit/checks/answer-conciseness.ts +322 -0
  20. package/src/audit/checks/asset-minification.ts +270 -0
  21. package/src/audit/checks/bing-optimization.ts +206 -0
  22. package/src/audit/checks/brand-mention-optimization.ts +349 -0
  23. package/src/audit/checks/caching-headers.ts +305 -0
  24. package/src/audit/checks/canonical-advanced.ts +150 -0
  25. package/src/audit/checks/canonical-domain.ts +196 -0
  26. package/src/audit/checks/citation-quality.ts +358 -0
  27. package/src/audit/checks/client-rendering.ts +542 -0
  28. package/src/audit/checks/color-contrast.ts +342 -0
  29. package/src/audit/checks/content-freshness.ts +170 -0
  30. package/src/audit/checks/content-science.ts +589 -0
  31. package/src/audit/checks/conversion-elements.ts +526 -0
  32. package/src/audit/checks/crawlability.ts +220 -0
  33. package/src/audit/checks/directory-listing.ts +172 -0
  34. package/src/audit/checks/dom-analysis.ts +191 -0
  35. package/src/audit/checks/dom-size.ts +246 -0
  36. package/src/audit/checks/duplicate-content.ts +194 -0
  37. package/src/audit/checks/eeat-signals.ts +990 -0
  38. package/src/audit/checks/entity-seo.ts +396 -0
  39. package/src/audit/checks/featured-snippet.ts +473 -0
  40. package/src/audit/checks/freshness-signals.ts +443 -0
  41. package/src/audit/checks/funnel-intent.ts +463 -0
  42. package/src/audit/checks/hreflang.ts +174 -0
  43. package/src/audit/checks/html-compliance.ts +302 -0
  44. package/src/audit/checks/image-dimensions.ts +167 -0
  45. package/src/audit/checks/images.ts +160 -0
  46. package/src/audit/checks/indexnow.ts +275 -0
  47. package/src/audit/checks/interactive-tools.ts +475 -0
  48. package/src/audit/checks/internal-link-graph.ts +436 -0
  49. package/src/audit/checks/keyword-analysis.ts +239 -0
  50. package/src/audit/checks/keyword-cannibalization.ts +385 -0
  51. package/src/audit/checks/keyword-placement.ts +471 -0
  52. package/src/audit/checks/links.ts +203 -0
  53. package/src/audit/checks/llms-txt.ts +224 -0
  54. package/src/audit/checks/local-seo.ts +296 -0
  55. package/src/audit/checks/mobile.ts +167 -0
  56. package/src/audit/checks/modern-images.ts +226 -0
  57. package/src/audit/checks/navboost-signals.ts +395 -0
  58. package/src/audit/checks/on-page.ts +209 -0
  59. package/src/audit/checks/page-resources.ts +285 -0
  60. package/src/audit/checks/pagination.ts +180 -0
  61. package/src/audit/checks/performance.ts +153 -0
  62. package/src/audit/checks/platform-presence.ts +580 -0
  63. package/src/audit/checks/redirect-analysis.ts +153 -0
  64. package/src/audit/checks/redirect-chain.ts +389 -0
  65. package/src/audit/checks/resource-hints.ts +420 -0
  66. package/src/audit/checks/responsive-css.ts +247 -0
  67. package/src/audit/checks/responsive-images.ts +396 -0
  68. package/src/audit/checks/review-ecosystem.ts +415 -0
  69. package/src/audit/checks/robots-validation.ts +373 -0
  70. package/src/audit/checks/security-headers.ts +172 -0
  71. package/src/audit/checks/security.ts +144 -0
  72. package/src/audit/checks/serp-preview.ts +251 -0
  73. package/src/audit/checks/site-maturity.ts +444 -0
  74. package/src/audit/checks/social-meta.test.ts +275 -0
  75. package/src/audit/checks/social-meta.ts +134 -0
  76. package/src/audit/checks/soft-404.ts +151 -0
  77. package/src/audit/checks/structured-data.ts +238 -0
  78. package/src/audit/checks/tech-detection.ts +496 -0
  79. package/src/audit/checks/topical-clusters.ts +435 -0
  80. package/src/audit/checks/tracker-bloat.ts +462 -0
  81. package/src/audit/checks/tracking-verification.test.ts +371 -0
  82. package/src/audit/checks/tracking-verification.ts +636 -0
  83. package/src/audit/checks/url-safety.ts +682 -0
  84. package/src/audit/deno-entry.ts +66 -0
  85. package/src/audit/discovery/index.ts +15 -0
  86. package/src/audit/discovery/link-crawler.ts +232 -0
  87. package/src/audit/discovery/repo-routes.ts +347 -0
  88. package/src/audit/engine.ts +620 -0
  89. package/src/audit/fixes/index.ts +209 -0
  90. package/src/audit/fixes/social-meta-fixes.test.ts +329 -0
  91. package/src/audit/fixes/social-meta-fixes.ts +463 -0
  92. package/src/audit/index.ts +74 -0
  93. package/src/audit/runner.test.ts +299 -0
  94. package/src/audit/runner.ts +130 -0
  95. package/src/audit/types.ts +1953 -0
  96. package/src/content/featured-snippet.ts +367 -0
  97. package/src/content/generator.test.ts +534 -0
  98. package/src/content/generator.ts +501 -0
  99. package/src/content/headline.ts +317 -0
  100. package/src/content/index.ts +62 -0
  101. package/src/content/intent.ts +258 -0
  102. package/src/content/keyword-density.ts +349 -0
  103. package/src/content/readability.ts +262 -0
  104. package/src/executor.ts +336 -0
  105. package/src/fixer.ts +416 -0
  106. package/src/frameworks/detector.test.ts +248 -0
  107. package/src/frameworks/detector.ts +371 -0
  108. package/src/frameworks/index.ts +68 -0
  109. package/src/frameworks/recipes/angular.yaml +171 -0
  110. package/src/frameworks/recipes/astro.yaml +206 -0
  111. package/src/frameworks/recipes/django.yaml +180 -0
  112. package/src/frameworks/recipes/laravel.yaml +137 -0
  113. package/src/frameworks/recipes/nextjs.yaml +268 -0
  114. package/src/frameworks/recipes/nuxt.yaml +175 -0
  115. package/src/frameworks/recipes/rails.yaml +188 -0
  116. package/src/frameworks/recipes/react.yaml +202 -0
  117. package/src/frameworks/recipes/sveltekit.yaml +154 -0
  118. package/src/frameworks/recipes/vue.yaml +137 -0
  119. package/src/frameworks/recipes/wordpress.yaml +209 -0
  120. package/src/frameworks/suggestion-engine.ts +320 -0
  121. package/src/geo/geo-content.test.ts +305 -0
  122. package/src/geo/geo-content.ts +266 -0
  123. package/src/geo/geo-history.test.ts +473 -0
  124. package/src/geo/geo-history.ts +433 -0
  125. package/src/geo/geo-tracker.test.ts +359 -0
  126. package/src/geo/geo-tracker.ts +411 -0
  127. package/src/geo/index.ts +10 -0
  128. package/src/git/commit-helper.test.ts +261 -0
  129. package/src/git/commit-helper.ts +329 -0
  130. package/src/git/index.ts +12 -0
  131. package/src/git/pr-helper.test.ts +284 -0
  132. package/src/git/pr-helper.ts +307 -0
  133. package/src/index.ts +66 -0
  134. package/src/keywords/ai-keyword-engine.ts +1062 -0
  135. package/src/keywords/ai-summarizer.ts +387 -0
  136. package/src/keywords/ci-mode.ts +555 -0
  137. package/src/keywords/engine.ts +359 -0
  138. package/src/keywords/index.ts +151 -0
  139. package/src/keywords/llm-judge.ts +357 -0
  140. package/src/keywords/nlp-analysis.ts +706 -0
  141. package/src/keywords/prioritizer.ts +295 -0
  142. package/src/keywords/site-crawler.ts +342 -0
  143. package/src/keywords/sources/autocomplete.ts +139 -0
  144. package/src/keywords/sources/competitive-search.ts +450 -0
  145. package/src/keywords/sources/competitor-analysis.ts +374 -0
  146. package/src/keywords/sources/dataforseo.ts +206 -0
  147. package/src/keywords/sources/free-sources.ts +294 -0
  148. package/src/keywords/sources/gsc.ts +123 -0
  149. package/src/keywords/topic-grouping.ts +327 -0
  150. package/src/keywords/types.ts +144 -0
  151. package/src/keywords/wizard.ts +457 -0
  152. package/src/loader.ts +40 -0
  153. package/src/reports/index.ts +7 -0
  154. package/src/reports/report-generator.test.ts +293 -0
  155. package/src/reports/report-generator.ts +713 -0
  156. package/src/scheduler/alerts.test.ts +458 -0
  157. package/src/scheduler/alerts.ts +328 -0
  158. package/src/scheduler/index.ts +8 -0
  159. package/src/scheduler/scheduled-audit.test.ts +377 -0
  160. package/src/scheduler/scheduled-audit.ts +149 -0
  161. package/src/test/integration-test.ts +325 -0
  162. package/src/tools/analyzer.ts +373 -0
  163. package/src/tools/crawl.ts +293 -0
  164. package/src/tools/files.ts +301 -0
  165. package/src/tools/h1-fixer.ts +249 -0
  166. package/src/tools/index.ts +67 -0
  167. package/src/tracking/github-action.ts +326 -0
  168. package/src/tracking/google-analytics.ts +265 -0
  169. package/src/tracking/index.ts +45 -0
  170. package/src/tracking/report-generator.ts +386 -0
  171. package/src/tracking/search-console.ts +335 -0
  172. package/src/types.ts +134 -0
  173. package/src/utils/http.ts +302 -0
  174. package/src/wasm-adapter.ts +297 -0
  175. package/src/wasm-entry.ts +14 -0
  176. package/tsconfig.json +17 -0
  177. package/tsup.wasm.config.ts +26 -0
  178. package/vitest.config.ts +15 -0
@@ -0,0 +1,295 @@
1
+ // Keyword Prioritization Algorithm
2
+
3
+ import type {
4
+ SiteProfile,
5
+ KeywordData,
6
+ KeywordOpportunity,
7
+ KeywordAction,
8
+ KeywordResearchResult,
9
+ } from './types.js';
10
+ import { getMaxKdThreshold, PRIORITY_WEIGHTS } from './types.js';
11
+
12
/**
 * Score, classify and rank keywords for a site.
 *
 * Each keyword is enriched with a priority score, a difficulty category and a
 * suggested action, then the whole list is ordered by descending score.
 *
 * @param keywords     Raw keyword data to evaluate.
 * @param siteProfile  Profile used to derive the maximum keyword difficulty
 *                     and the business-value weighting.
 * @param existingMeta Current title / description / H1 of the page, used to
 *                     decide which on-page element to suggest changing.
 * @returns The ranked opportunities, the same list split per category,
 *          textual recommendations and the difficulty threshold that was used.
 */
export function prioritizeKeywords(
  keywords: KeywordData[],
  siteProfile: SiteProfile,
  existingMeta?: { title?: string; description?: string; h1?: string }
): KeywordResearchResult {
  const kdCeiling = getMaxKdThreshold(siteProfile);

  // Enrich every keyword, then rank the enriched list by score (highest first).
  const ranked: KeywordOpportunity[] = [];
  for (const entry of keywords) {
    const priorityScore = calculatePriorityScore(entry, siteProfile, kdCeiling);
    const category = categorizeKeyword(entry.keywordDifficulty, kdCeiling);
    const suggestedAction = suggestAction(entry, existingMeta, category);
    ranked.push({ ...entry, priorityScore, category, suggestedAction });
  }
  ranked.sort((left, right) => right.priorityScore - left.priorityScore);

  // Buckets keep the ranked order because they are filtered after sorting.
  const inCategory = (wanted: KeywordOpportunity['category']): KeywordOpportunity[] =>
    ranked.filter((opportunity) => opportunity.category === wanted);

  const quickWins = inCategory('quick-win');
  const mediumTerm = inCategory('medium-term');
  const longTerm = inCategory('long-term');

  const recommendations = generateRecommendations(siteProfile, quickWins, mediumTerm);

  return {
    siteProfile,
    keywords: ranked,
    quickWins,
    mediumTerm,
    longTerm,
    recommendations,
    maxKdThreshold: kdCeiling,
  };
}
54
+
55
/**
 * Compute a keyword's overall priority as a weighted sum of three sub-scores.
 *
 * - Business value: how well the keyword's intent matches the business goal.
 * - Difficulty: inverse of keyword difficulty relative to `maxKd`; keywords
 *   harder than `maxKd` score 0.
 * - Traffic: search volume scaled so that 1000 searches reaches the cap of 100.
 *
 * The difficulty and traffic sub-scores are clamped to the 0-100 range, and a
 * zero `maxKd` no longer produces `NaN` (0 / 0), which previously corrupted
 * both the score and any sort that relied on it.
 *
 * @param keyword Keyword data (difficulty, search volume, optional intent).
 * @param profile Site profile supplying the business goal.
 * @param maxKd   Highest keyword difficulty considered reachable for the site.
 * @returns The rounded weighted score.
 */
function calculatePriorityScore(
  keyword: KeywordData,
  profile: SiteProfile,
  maxKd: number
): number {
  const clampScore = (value: number): number => Math.min(100, Math.max(0, value));

  // Business value score (0-100)
  const businessScore = calculateBusinessValue(keyword, profile.businessGoal);

  // Difficulty score (inverse - lower KD = higher score)
  let difficultyScore = 0;
  if (keyword.keywordDifficulty <= maxKd) {
    // With maxKd <= 0 the ratio is undefined; a keyword that still passes the
    // threshold is as easy as it gets, so give it the full score.
    difficultyScore =
      maxKd > 0 ? clampScore(100 - (keyword.keywordDifficulty / maxKd) * 100) : 100;
  }

  // Traffic score (normalized, capped at 100, never negative)
  const trafficScore = clampScore((keyword.searchVolume / 1000) * 100);

  // Weighted sum
  const score =
    businessScore * PRIORITY_WEIGHTS.businessValue +
    difficultyScore * PRIORITY_WEIGHTS.difficulty +
    trafficScore * PRIORITY_WEIGHTS.trafficPotential;

  return Math.round(score);
}
80
+
81
/**
 * Rate how valuable a keyword is for the given business goal (0-100).
 *
 * The keyword's own `intent` is used when present; otherwise it is inferred
 * from the keyword text. An intent with no entry in the table scores 50.
 *
 * @param keyword Keyword whose intent is rated.
 * @param goal    Business goal selecting which weighting row applies.
 * @returns The weight for the (goal, intent) pair, or 50 as a neutral default.
 */
function calculateBusinessValue(keyword: KeywordData, goal: SiteProfile['businessGoal']): number {
  // Builds one weighting row; argument order is fixed so rows line up visually.
  const row = (
    transactional: number,
    commercial: number,
    informational: number,
    navigational: number
  ): Record<string, number> => ({ transactional, commercial, informational, navigational });

  const weightsByGoal: Record<SiteProfile['businessGoal'], Record<string, number>> = {
    //             transactional, commercial, informational, navigational
    signups: row(100, 80, 40, 20),
    purchases: row(100, 90, 30, 20),
    leads: row(80, 100, 60, 20),
    awareness: row(30, 60, 100, 40),
  };

  const resolvedIntent = keyword.intent ? keyword.intent : inferIntent(keyword.keyword);
  const weight = weightsByGoal[goal][resolvedIntent];

  return weight || 50;
}
114
+
115
/**
 * Guess the search intent behind a keyword from indicator words.
 *
 * Rules are evaluated in order (transactional, then commercial, then
 * navigational) and the first match wins; a keyword matching none of them is
 * treated as informational. Matching is case-insensitive and whole-word.
 *
 * @param keyword The keyword phrase to classify.
 * @returns The inferred intent.
 */
function inferIntent(keyword: string): 'informational' | 'commercial' | 'transactional' | 'navigational' {
  const rules: Array<[RegExp, 'commercial' | 'transactional' | 'navigational']> = [
    [/\b(buy|purchase|order|subscribe|download|get|try|free trial)\b/, 'transactional'],
    [/\b(best|top|review|compare|vs|versus|alternative|pricing|cost)\b/, 'commercial'],
    [/\b(login|sign in|website|official|app)\b/, 'navigational'],
  ];

  const lowered = keyword.toLowerCase();
  for (const [indicator, intent] of rules) {
    if (indicator.test(lowered)) {
      return intent;
    }
  }

  return 'informational';
}
136
+
137
/**
 * Bucket a keyword by difficulty relative to the site's reachable maximum.
 *
 * - `quick-win`: difficulty at or below half of `maxKd`, capped at 15.
 * - `medium-term`: above that but still at or below `maxKd`.
 * - `long-term`: everything else.
 *
 * @param kd    Keyword difficulty of the keyword.
 * @param maxKd Highest difficulty considered reachable for the site.
 */
function categorizeKeyword(kd: number, maxKd: number): 'quick-win' | 'medium-term' | 'long-term' {
  const quickWinCeiling = Math.min(maxKd * 0.5, 15);

  return kd <= quickWinCeiling ? 'quick-win' : kd <= maxKd ? 'medium-term' : 'long-term';
}
145
+
146
/**
 * Pick the single most useful next step for a keyword.
 *
 * For quick wins the first applicable on-page change is suggested, checked in
 * this order: title (only when a title exists and lacks the keyword), H1
 * (whenever the H1 is missing or lacks the keyword), meta description (only
 * when one exists and lacks the keyword), then a generic "optimize existing"
 * fallback. Medium- and long-term keywords both get a "create content"
 * action, with different wording.
 *
 * @param keyword      Keyword being evaluated.
 * @param existingMeta Current page title / description / H1, if known.
 * @param category     Difficulty bucket of the keyword.
 */
function suggestAction(
  keyword: KeywordData,
  existingMeta: { title?: string; description?: string; h1?: string } | undefined,
  category: 'quick-win' | 'medium-term' | 'long-term'
): KeywordAction {
  const kw = keyword.keyword;

  // Case-insensitive containment; yields undefined when the text is missing.
  const mentionsKeyword = (text: string | undefined): boolean | undefined =>
    text?.toLowerCase().includes(kw.toLowerCase());

  const titleHasKeyword = mentionsKeyword(existingMeta?.title);
  const descriptionHasKeyword = mentionsKeyword(existingMeta?.description);
  const h1HasKeyword = mentionsKeyword(existingMeta?.h1);

  if (category !== 'quick-win') {
    const description =
      category === 'medium-term'
        ? `Create dedicated content targeting "${kw}"`
        : `Build authority first, then target "${kw}" (high competition)`;
    return { type: 'create-content', description };
  }

  // Quick wins: tweak an element that already exists on the page.
  if (existingMeta?.title && !titleHasKeyword) {
    return {
      type: 'add-to-title',
      description: `Add "${kw}" to your title tag`,
      targetElement: 'title',
      currentValue: existingMeta.title,
      suggestedValue: generateTitleWithKeyword(existingMeta.title, kw),
    };
  }

  // The H1 suggestion also covers pages with no H1 at all.
  if (!h1HasKeyword) {
    return {
      type: 'add-to-h1',
      description: `Include "${kw}" in your H1 heading`,
      targetElement: 'h1',
      currentValue: existingMeta?.h1,
      suggestedValue: generateH1WithKeyword(existingMeta?.h1, kw),
    };
  }

  if (existingMeta?.description && !descriptionHasKeyword) {
    return {
      type: 'add-to-meta',
      description: `Add "${kw}" to your meta description`,
      targetElement: 'meta[name="description"]',
      currentValue: existingMeta.description,
      suggestedValue: generateDescriptionWithKeyword(existingMeta.description, kw),
    };
  }

  return {
    type: 'optimize-existing',
    description: `Optimize existing content for "${kw}"`,
  };
}
210
+
211
/**
 * Propose a title tag that leads with the keyword.
 *
 * - Titles shorter than 30 characters keep their full text after the keyword.
 * - Longer titles containing " - " keep only their last segment.
 * - Any other title is appended whole after the keyword.
 *
 * The 60-character limit is applied to every branch. Previously only the last
 * branch was truncated, so a long keyword could yield an over-long suggestion.
 * Trailing whitespace left by the cut is removed.
 *
 * @param currentTitle The page's existing title.
 * @param keyword      Keyword to place at the front of the title.
 * @returns The suggested title, at most 60 characters long.
 */
function generateTitleWithKeyword(currentTitle: string, keyword: string): string {
  const maxTitleLength = 60;
  const lead = capitalizeFirst(keyword);

  let candidate: string;
  if (currentTitle.length < 30) {
    // Short title: there is room to keep it whole.
    candidate = `${lead} - ${currentTitle}`;
  } else {
    const parts = currentTitle.split(' - ');
    // With a "page - brand" style title, keep only the final segment.
    const suffix = parts.length >= 2 ? parts[parts.length - 1] : currentTitle;
    candidate = `${lead} | ${suffix}`;
  }

  return candidate.substring(0, maxTitleLength).trimEnd();
}
225
+
226
/**
 * Propose an H1 that leads with the keyword.
 *
 * @param currentH1 Existing H1 text, if the page has one.
 * @param keyword   Keyword to feature in the heading.
 * @returns The capitalized keyword alone when there is no H1, otherwise the
 *          capitalized keyword followed by " - " and the existing H1.
 */
function generateH1WithKeyword(currentH1: string | undefined, keyword: string): string {
  const lead = capitalizeFirst(keyword);

  return currentH1 ? `${lead} - ${currentH1}` : lead;
}
234
+
235
/**
 * Propose a meta description that mentions the keyword.
 *
 * If the description already contains the keyword's first word
 * (case-insensitive), it is returned unchanged. Otherwise the capitalized
 * keyword and ": " are prepended and the result is cut to 160 characters.
 *
 * @param currentDesc Existing meta description.
 * @param keyword     Keyword to introduce.
 */
function generateDescriptionWithKeyword(currentDesc: string, keyword: string): string {
  const [leadWord] = keyword.toLowerCase().split(' ');
  const alreadyOnTopic = currentDesc.toLowerCase().includes(leadWord);

  if (alreadyOnTopic) {
    return currentDesc;
  }

  const prefixed = `${capitalizeFirst(keyword)}: ${currentDesc}`;
  return prefixed.substring(0, 160);
}
244
+
245
/**
 * Upper-case the first character of a string, leaving the rest untouched.
 *
 * @param str Text to capitalize; an empty string is returned as-is.
 */
function capitalizeFirst(str: string): string {
  const head = str.substring(0, 1);
  const tail = str.substring(1);
  return `${head.toUpperCase()}${tail}`;
}
248
+
249
/**
 * Build human-readable advice from the site profile and keyword buckets.
 *
 * Messages are appended in a fixed order: new-domain guidance, backlink
 * guidance, low-capacity focus advice, quick-win summary, medium-term summary.
 *
 * @param profile    Site profile (domain age, backlink count, content capacity).
 * @param quickWins  Keywords categorized as quick wins.
 * @param mediumTerm Keywords categorized as medium-term.
 * @returns The applicable recommendations; empty when none apply.
 */
function generateRecommendations(
  profile: SiteProfile,
  quickWins: KeywordOpportunity[],
  mediumTerm: KeywordOpportunity[]
): string[] {
  const isNewDomain = profile.domainAge === 'new';
  const lacksBacklinks = profile.backlinkCount === 'none' || profile.backlinkCount === 'few';
  const quickWinCount = quickWins.length;
  const mediumCount = mediumTerm.length;

  const advice: string[] = [];

  if (isNewDomain) {
    advice.push(
      'Focus on quick-win keywords (KD < 15) first to build initial traffic and authority.',
      'Consider creating long-form, comprehensive content to establish topical authority.'
    );
  }

  if (lacksBacklinks) {
    advice.push(
      'Prioritize building quality backlinks to increase your ranking potential for competitive keywords.'
    );
  }

  // Low content capacity: narrow the focus when there are more than two quick wins.
  if (profile.contentCapacity === 'low' && quickWinCount > 2) {
    const focusCount = Math.min(2, quickWinCount);
    advice.push(`Focus on your top ${focusCount} quick-win keywords first, then expand.`);
  }

  if (quickWinCount > 0) {
    advice.push(
      `You have ${quickWinCount} quick-win keyword opportunities that you can rank for relatively quickly.`
    );
  }

  // Medium-term keywords are only highlighted for domains that are not new.
  if (mediumCount > 0 && !isNewDomain) {
    advice.push(
      `${mediumCount} medium-difficulty keywords are within reach with focused content and some link building.`
    );
  }

  return advice;
}
@@ -0,0 +1,342 @@
1
+ /**
2
+ * Site Crawler for Keyword Research
3
+ *
4
+ * Crawls an entire site to collect text content for AI analysis.
5
+ * Used to understand what the site does and generate relevant keywords.
6
+ */
7
+
8
+ import * as cheerio from 'cheerio';
9
+ import { httpGet } from '../utils/http.js';
10
+
11
/** Content and classification extracted from a single crawled page. */
export interface CrawledPage {
  /** URL the page was fetched from. */
  url: string;
  /** Trimmed text of the document title. */
  title: string;
  /** Trimmed meta description, or an empty string when there is none. */
  description: string;
  /** Trimmed text of the first H1, or an empty string when there is none. */
  h1: string;
  /** Trimmed, non-empty H2 texts. */
  h2s: string[];
  /** Whitespace-collapsed body text, truncated for AI processing. */
  mainContent: string;
  /** Number of whitespace-separated tokens in `mainContent`. */
  wordCount: number;
  /**
   * `href` values collected from the page's anchors (mailto:, tel: and
   * fragment-only links are skipped). Values are raw and may be relative or
   * point to other hosts; the crawler filters them later.
   */
  internalLinks: string[];

  /** True when the URL matches a product pattern. */
  isProductPage: boolean;
  /** True when the URL matches a pricing pattern. */
  isPricingPage: boolean;
  /** True when the URL matches a blog pattern. */
  isBlogPost: boolean;
  /** True when the URL matches a feature pattern. */
  isFeaturePage: boolean;
}
25
+
26
/** Aggregate outcome of crawling one site. */
export interface SiteCrawlResult {
  /** Hostname of the start URL. */
  domain: string;
  /** Successfully crawled pages, in crawl order. */
  pages: CrawledPage[];
  /** Title, description and main content of every page joined into one text. */
  aggregatedContent: string;
  /** Sum of `wordCount` over all pages. */
  totalWordCount: number;
  /** Distinct H1 and H2 texts seen across all pages. */
  uniqueHeadings: string[];

  /** Number of pages per detected page type; a page may count in several. */
  detectedPageTypes: {
    product: number;
    pricing: number;
    blog: number;
    feature: number;
    /** Pages that matched none of the four types above. */
    other: number;
  };

  /** Bookkeeping about the crawl itself. */
  crawlStats: {
    /** Number of distinct URLs a fetch was attempted for. */
    attempted: number;
    /** Number of pages fetched and parsed successfully. */
    succeeded: number;
    /** Number of fetches that threw. */
    failed: number;
    /** Wall-clock crawl time in milliseconds. */
    duration: number;
  };
}
46
+
47
/**
 * URL fragments that mark a link as not worth crawling: infrastructure and
 * admin directories first, then file extensions of non-HTML resources.
 */
const EXCLUDED_PATHS = [
  // Platform / framework internals and back-office paths
  '/cdn-cgi/', '/wp-admin/', '/wp-includes/', '/wp-json/',
  '/api/', '/admin/', '/_next/', '/static/', '/assets/',

  // Data and document files
  '.xml', '.json', '.pdf', '.zip',

  // Images
  '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico',

  // Stylesheets and scripts
  '.css', '.js',
];
70
+
71
/**
 * URL fragments used to classify a page. A page belongs to a type when its
 * URL contains any fragment listed for that type; types are not exclusive.
 */
const PAGE_TYPE_PATTERNS = {
  product: [
    '/product',
    '/features',
    '/solutions',
    '/platform',
    '/tour',
  ],
  pricing: [
    '/pricing',
    '/plans',
    '/subscription',
  ],
  blog: [
    '/blog',
    '/news',
    '/articles',
    '/posts',
    '/journal',
  ],
  feature: [
    '/feature',
    '/capability',
    '/integration',
  ],
};
77
+
78
/**
 * Crawl a site breadth-first and collect its text content.
 *
 * Starting from `startUrl`, pages on the same hostname are fetched one at a
 * time until `maxPages` pages have been collected or the queue is empty.
 * Links found on a page are followed only while the page's depth is below
 * `maxDepth`. Fetch failures are counted, not thrown.
 *
 * Fixes over the previous version:
 * - The base origin keeps a non-default port (e.g. `http://localhost:3000`).
 * - Relative links are resolved against the page they were found on rather
 *   than the site root, so `href="post-2"` on `/blog/` maps to `/blog/post-2`.
 * - Exclusion rules are matched against the URL path only: directory entries
 *   match as path segments (including a path whose trailing slash was
 *   stripped), and extension entries match the end of the path. Previously
 *   `.js` also excluded `.jsp` pages and any host containing `.js`.
 *
 * @param startUrl Absolute URL to start from; an invalid URL throws.
 * @param options  `maxPages` (default 30), `maxDepth` (default 3) and
 *                 per-request `timeout` in milliseconds (default 10000).
 * @returns Crawled pages plus aggregated content, headings, page-type counts
 *          and crawl statistics.
 */
export async function crawlSite(
  startUrl: string,
  options: {
    maxPages?: number;
    maxDepth?: number;
    timeout?: number;
  } = {}
): Promise<SiteCrawlResult> {
  const { maxPages = 30, maxDepth = 3, timeout = 10000 } = options;

  const startTime = Date.now();
  const parsedUrl = new URL(startUrl);
  const domain = parsedUrl.hostname;
  // `origin` is protocol + host + port, so non-default ports survive.
  const baseUrl = parsedUrl.origin;

  const visited = new Set<string>();
  const toVisit: Array<{ url: string; depth: number }> = [{ url: startUrl, depth: 0 }];
  const pages: CrawledPage[] = [];
  let failed = 0;

  console.log(`🕷️ Crawling ${domain} (max ${maxPages} pages)...`);

  while (toVisit.length > 0 && pages.length < maxPages) {
    const next = toVisit.shift();
    if (!next) break;
    const { url, depth } = next;

    // Normalize URL
    const normalizedUrl = normalizeUrl(url, baseUrl);
    if (!normalizedUrl || visited.has(normalizedUrl)) continue;

    // Only crawl same domain
    let target: URL;
    try {
      target = new URL(normalizedUrl);
    } catch {
      continue;
    }
    if (target.hostname !== domain) continue;

    // Skip excluded paths. Only the path is inspected; a trailing slash is
    // appended so "/admin" (slash stripped by normalization) matches "/admin/".
    const path = target.pathname.toLowerCase();
    const isExcluded = EXCLUDED_PATHS.some((pattern) =>
      pattern.startsWith('.') ? path.endsWith(pattern) : `${path}/`.includes(pattern)
    );
    if (isExcluded) continue;

    visited.add(normalizedUrl);

    try {
      const page = await crawlPage(normalizedUrl, timeout);
      pages.push(page);

      // Add internal links to queue (if not at max depth)
      if (depth < maxDepth) {
        for (const link of page.internalLinks) {
          // Resolve against the un-normalized page URL so a trailing slash on
          // the page still acts as the directory for relative links.
          let absoluteLink: string;
          try {
            absoluteLink = new URL(link, url).href;
          } catch {
            continue;
          }

          const normalizedLink = normalizeUrl(absoluteLink, baseUrl);
          if (normalizedLink && !visited.has(normalizedLink)) {
            toVisit.push({ url: absoluteLink, depth: depth + 1 });
          }
        }
      }

      // Progress indicator
      if (pages.length % 5 === 0) {
        console.log(` Crawled ${pages.length} pages...`);
      }
    } catch {
      // Best-effort crawl: a page that cannot be fetched is counted and skipped.
      failed++;
    }
  }

  const duration = Date.now() - startTime;

  // Aggregate content
  const aggregatedContent = pages
    .map((p) => `# ${p.title}\n${p.description}\n${p.mainContent}`)
    .join('\n\n---\n\n');

  const totalWordCount = pages.reduce((sum, p) => sum + p.wordCount, 0);

  // Collect unique headings
  const allHeadings = new Set<string>();
  for (const page of pages) {
    if (page.h1) allHeadings.add(page.h1);
    for (const h2 of page.h2s) allHeadings.add(h2);
  }

  // Count page types
  const detectedPageTypes = {
    product: pages.filter((p) => p.isProductPage).length,
    pricing: pages.filter((p) => p.isPricingPage).length,
    blog: pages.filter((p) => p.isBlogPost).length,
    feature: pages.filter((p) => p.isFeaturePage).length,
    other: pages.filter(
      (p) => !p.isProductPage && !p.isPricingPage && !p.isBlogPost && !p.isFeaturePage
    ).length,
  };

  console.log(`✅ Crawled ${pages.length} pages in ${(duration / 1000).toFixed(1)}s`);

  return {
    domain,
    pages,
    aggregatedContent,
    totalWordCount,
    uniqueHeadings: Array.from(allHeadings),
    detectedPageTypes,
    crawlStats: {
      attempted: visited.size,
      succeeded: pages.length,
      failed,
      duration,
    },
  };
}
190
+
191
/**
 * Fetch one page and extract its metadata, text content and outgoing links.
 *
 * Only a 200 response is accepted; anything else rejects via `httpGet`.
 *
 * Fixes over the previous version:
 * - Links and the H1 are read before navigation, header and footer elements
 *   are removed, so site navigation is discoverable and an H1 inside
 *   `<header>` is not lost.
 * - Only the first `<title>` is used, so titles of inline SVGs are not
 *   concatenated into the page title.
 * - Empty content reports a word count of 0 instead of 1.
 * - Page type is detected from the URL path only, so a host such as
 *   `products.example.com` no longer marks every page as a product page.
 *
 * @param url     Absolute URL of the page to fetch.
 * @param timeout Request timeout passed to `httpGet`.
 * @returns The extracted page data.
 */
async function crawlPage(url: string, timeout: number): Promise<CrawledPage> {
  const response = await httpGet<string>(url, {
    timeout,
    validateStatus: (status) => status === 200,
  });

  const html = response.data;
  const $ = cheerio.load(html);

  // Read everything that may live in page chrome BEFORE stripping it.
  const title = $('title').first().text().trim();
  const description = $('meta[name="description"]').attr('content')?.trim() || '';
  const h1 = $('h1').first().text().trim();

  // Extract links (raw hrefs; the caller resolves and filters them)
  const skippedPrefixes = ['mailto:', 'tel:', 'javascript:', '#'];
  const internalLinks: string[] = [];
  $('a[href]').each((_, el) => {
    const href = $(el).attr('href')?.trim();
    if (!href) return;

    const lowered = href.toLowerCase();
    if (skippedPrefixes.some((prefix) => lowered.startsWith(prefix))) return;

    internalLinks.push(href);
  });

  // Remove non-content elements so headings and body text exclude page chrome
  $('script, style, noscript, iframe, nav, footer, header, aside, [role="navigation"]').remove();

  const h2s = $('h2')
    .map((_, el) => $(el).text().trim())
    .get()
    .filter((h) => h.length > 0);

  // Get main content: first selector that matches and has text wins
  const mainSelectors = ['main', 'article', '[role="main"]', '.content', '#content', '.post-content'];
  let mainContent = '';

  for (const selector of mainSelectors) {
    const el = $(selector);
    if (el.length > 0) {
      mainContent = el.text().trim();
      break;
    }
  }

  // Fallback to body
  if (!mainContent) {
    mainContent = $('body').text().trim();
  }

  // Collapse all whitespace (including newlines) to single spaces
  mainContent = mainContent.replace(/\s+/g, ' ').trim();

  // Limit content length for AI processing
  if (mainContent.length > 5000) {
    mainContent = mainContent.substring(0, 5000) + '...';
  }

  // ''.split(...) yields [''], so guard the empty case explicitly.
  const wordCount = mainContent.length > 0 ? mainContent.split(/\s+/).length : 0;

  // Detect page type from the path only (host and query are ignored)
  const pathLower = new URL(url).pathname.toLowerCase();
  const isProductPage = PAGE_TYPE_PATTERNS.product.some((p) => pathLower.includes(p));
  const isPricingPage = PAGE_TYPE_PATTERNS.pricing.some((p) => pathLower.includes(p));
  const isBlogPost = PAGE_TYPE_PATTERNS.blog.some((p) => pathLower.includes(p));
  const isFeaturePage = PAGE_TYPE_PATTERNS.feature.some((p) => pathLower.includes(p));

  return {
    url,
    title,
    description,
    h1,
    h2s,
    mainContent,
    wordCount,
    internalLinks,
    isProductPage,
    isPricingPage,
    isBlogPost,
    isFeaturePage,
  };
}
276
+
277
/**
 * Resolve a link to an absolute, canonical form used for de-duplication.
 *
 * The link is resolved against `baseUrl` (absolute links ignore the base),
 * the fragment is dropped, and a single trailing slash is removed from the
 * path unless the path is the root.
 *
 * Fixes over the previous version:
 * - Relative paths that merely begin with "http" (e.g. `httpd-guide`) are
 *   resolved instead of being rejected as malformed absolute URLs.
 * - The trailing slash is stripped from the path only, so a query string
 *   ending in "/" is left intact and `/blog/?page=2` matches `/blog?page=2`.
 * - Non-HTTP(S) schemes such as `mailto:` or `javascript:` return null.
 *
 * @param url     Link to normalize; may be absolute, relative or
 *                protocol-relative.
 * @param baseUrl Absolute URL used to resolve relative links.
 * @returns The normalized absolute URL, or null when it cannot be parsed or
 *          is not an HTTP(S) URL.
 */
function normalizeUrl(url: string, baseUrl: string): string | null {
  try {
    // The URL constructor handles absolute, relative and protocol-relative input.
    const urlObj = new URL(url, baseUrl);

    if (urlObj.protocol !== 'http:' && urlObj.protocol !== 'https:') {
      return null;
    }

    // Remove hash
    urlObj.hash = '';

    // Remove trailing slash (except for root)
    if (urlObj.pathname.length > 1 && urlObj.pathname.endsWith('/')) {
      urlObj.pathname = urlObj.pathname.slice(0, -1);
    }

    return urlObj.href;
  } catch {
    return null;
  }
}
300
+
301
/**
 * Extract candidate key phrases from crawled content.
 *
 * Phrases come from three sources, all lower-cased and de-duplicated:
 * - Title segments of 4-49 characters, split on "|", ":" and on dashes that
 *   are surrounded by whitespace.
 * - Headings of 4-49 characters.
 * - Adjacent word pairs from meta descriptions, 6-39 characters long.
 *
 * Titles were previously split on every hyphen, which broke hyphenated terms
 * such as "real-time" or "e-commerce" into fragments. Dashes now act as
 * separators only when set off by whitespace.
 *
 * @param crawlResult Result of a site crawl.
 * @returns At most 100 phrases, in insertion order.
 */
export function extractKeyPhrases(crawlResult: SiteCrawlResult): string[] {
  // "|" and ":" always separate; "-", "–" and "—" only between whitespace.
  const titleSeparator = /\s+[-–—]\s+|[|:]/;
  const phrases = new Set<string>();

  // From titles
  for (const page of crawlResult.pages) {
    if (!page.title) continue;

    for (const segment of page.title.split(titleSeparator)) {
      const part = segment.trim();
      if (part.length > 3 && part.length < 50) {
        phrases.add(part.toLowerCase());
      }
    }
  }

  // From headings
  for (const heading of crawlResult.uniqueHeadings) {
    if (heading.length > 3 && heading.length < 50) {
      phrases.add(heading.toLowerCase());
    }
  }

  // From descriptions
  for (const page of crawlResult.pages) {
    if (!page.description) continue;

    // Simplified noun-phrase extraction: every pair of adjacent words
    const words = page.description.toLowerCase().split(/\s+/);
    for (let i = 0; i < words.length - 1; i++) {
      const bigram = `${words[i]} ${words[i + 1]}`;
      if (bigram.length > 5 && bigram.length < 40) {
        phrases.add(bigram);
      }
    }
  }

  return Array.from(phrases).slice(0, 100);
}