searchsocket 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- import { R as ResolvedSearchSocketConfig, b as SearchSocketConfig } from './types-BrG6XTUU.js';
1
+ import { R as ResolvedSearchSocketConfig, d as SearchSocketConfig } from './types-z2dw3H6E.js';
2
2
 
3
3
  interface SearchSocketHandleOptions {
4
4
  configPath?: string;
package/dist/sveltekit.js CHANGED
@@ -5009,32 +5009,32 @@ var require_URL = __commonJS({
5009
5009
  else
5010
5010
  return basepath.substring(0, lastslash + 1) + refpath;
5011
5011
  }
5012
- function remove_dot_segments(path14) {
5013
- if (!path14) return path14;
5012
+ function remove_dot_segments(path15) {
5013
+ if (!path15) return path15;
5014
5014
  var output = "";
5015
- while (path14.length > 0) {
5016
- if (path14 === "." || path14 === "..") {
5017
- path14 = "";
5015
+ while (path15.length > 0) {
5016
+ if (path15 === "." || path15 === "..") {
5017
+ path15 = "";
5018
5018
  break;
5019
5019
  }
5020
- var twochars = path14.substring(0, 2);
5021
- var threechars = path14.substring(0, 3);
5022
- var fourchars = path14.substring(0, 4);
5020
+ var twochars = path15.substring(0, 2);
5021
+ var threechars = path15.substring(0, 3);
5022
+ var fourchars = path15.substring(0, 4);
5023
5023
  if (threechars === "../") {
5024
- path14 = path14.substring(3);
5024
+ path15 = path15.substring(3);
5025
5025
  } else if (twochars === "./") {
5026
- path14 = path14.substring(2);
5026
+ path15 = path15.substring(2);
5027
5027
  } else if (threechars === "/./") {
5028
- path14 = "/" + path14.substring(3);
5029
- } else if (twochars === "/." && path14.length === 2) {
5030
- path14 = "/";
5031
- } else if (fourchars === "/../" || threechars === "/.." && path14.length === 3) {
5032
- path14 = "/" + path14.substring(4);
5028
+ path15 = "/" + path15.substring(3);
5029
+ } else if (twochars === "/." && path15.length === 2) {
5030
+ path15 = "/";
5031
+ } else if (fourchars === "/../" || threechars === "/.." && path15.length === 3) {
5032
+ path15 = "/" + path15.substring(4);
5033
5033
  output = output.replace(/\/?[^\/]*$/, "");
5034
5034
  } else {
5035
- var segment = path14.match(/(\/?([^\/]*))/)[0];
5035
+ var segment = path15.match(/(\/?([^\/]*))/)[0];
5036
5036
  output += segment;
5037
- path14 = path14.substring(segment.length);
5037
+ path15 = path15.substring(segment.length);
5038
5038
  }
5039
5039
  }
5040
5040
  return output;
@@ -16598,6 +16598,8 @@ var searchSocketConfigSchema = z.object({
16598
16598
  envVar: z.string().min(1).optional(),
16599
16599
  sanitize: z.boolean().optional()
16600
16600
  }).optional(),
16601
+ exclude: z.array(z.string()).optional(),
16602
+ respectRobotsTxt: z.boolean().optional(),
16601
16603
  source: z.object({
16602
16604
  mode: z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
16603
16605
  staticOutputDir: z.string().min(1).optional(),
@@ -16728,6 +16730,8 @@ function createDefaultConfig(projectId) {
16728
16730
  envVar: "SEARCHSOCKET_SCOPE",
16729
16731
  sanitize: true
16730
16732
  },
16733
+ exclude: [],
16734
+ respectRobotsTxt: true,
16731
16735
  source: {
16732
16736
  mode: "static-output",
16733
16737
  staticOutputDir: "build",
@@ -16758,7 +16762,7 @@ function createDefaultConfig(projectId) {
16758
16762
  },
16759
16763
  embeddings: {
16760
16764
  provider: "jina",
16761
- model: "jina-embeddings-v3",
16765
+ model: "jina-embeddings-v5-text-small",
16762
16766
  apiKeyEnv: "JINA_API_KEY",
16763
16767
  batchSize: 64,
16764
16768
  concurrency: 4
@@ -16771,9 +16775,9 @@ function createDefaultConfig(projectId) {
16771
16775
  }
16772
16776
  },
16773
16777
  rerank: {
16774
- enabled: false,
16778
+ enabled: true,
16775
16779
  topN: 20,
16776
- model: "jina-reranker-v2-base-multilingual"
16780
+ model: "jina-reranker-v3"
16777
16781
  },
16778
16782
  ranking: {
16779
16783
  enableIncomingLinkBoost: true,
@@ -16892,6 +16896,8 @@ ${issues}`
16892
16896
  ...defaults.scope,
16893
16897
  ...parsed.scope
16894
16898
  },
16899
+ exclude: parsed.exclude ?? defaults.exclude,
16900
+ respectRobotsTxt: parsed.respectRobotsTxt ?? defaults.respectRobotsTxt,
16895
16901
  source: {
16896
16902
  ...defaults.source,
16897
16903
  ...parsed.source,
@@ -17242,7 +17248,7 @@ var JinaReranker = class {
17242
17248
  constructor(options) {
17243
17249
  this.apiKey = options.apiKey;
17244
17250
  this.model = options.model;
17245
- this.maxRetries = options.maxRetries ?? 4;
17251
+ this.maxRetries = options.maxRetries ?? 2;
17246
17252
  }
17247
17253
  async rerank(query, candidates, topN) {
17248
17254
  if (candidates.length === 0) {
@@ -17252,7 +17258,8 @@ var JinaReranker = class {
17252
17258
  model: this.model,
17253
17259
  query,
17254
17260
  documents: candidates.map((candidate) => candidate.text),
17255
- top_n: topN ?? candidates.length
17261
+ top_n: topN ?? candidates.length,
17262
+ return_documents: false
17256
17263
  };
17257
17264
  let attempt = 0;
17258
17265
  while (attempt <= this.maxRetries) {
@@ -17855,6 +17862,36 @@ async function createVectorStore(config, cwd) {
17855
17862
  });
17856
17863
  }
17857
17864
 
17865
+ // src/utils/pattern.ts
17866
/**
 * Tests a URL path against a single glob-like pattern.
 *
 * A trailing slash on either argument is ignored (the root "/" is kept as is).
 * Supported pattern forms:
 *   - "<prefix>/**" matches the prefix itself and everything below it;
 *     a bare "/**" matches every URL.
 *   - "<prefix>/*" matches exactly one non-empty path segment directly
 *     below the prefix; a bare "/*" matches any single top-level segment.
 *   - any other pattern must equal the URL exactly.
 *
 * @param {string} url - URL path to test.
 * @param {string} pattern - Pattern to test the URL against.
 * @returns {boolean} True when the URL matches the pattern.
 */
function matchUrlPattern(url, pattern) {
  const stripTrailingSlash = (value) => {
    if (value === "/" || !value.endsWith("/")) {
      return value;
    }
    return value.slice(0, -1);
  };
  const target = stripTrailingSlash(url);
  const glob = stripTrailingSlash(pattern);

  // Recursive wildcard: the base path and its whole subtree.
  if (glob.endsWith("/**")) {
    const base = glob.slice(0, -3);
    return base === "" || target === base || target.startsWith(base + "/");
  }

  // Single-level wildcard: exactly one segment below the base path.
  if (glob.endsWith("/*")) {
    const base = glob.slice(0, -2);
    if (base === "") {
      return target !== "/" && !target.slice(1).includes("/");
    }
    if (!target.startsWith(base + "/")) {
      return false;
    }
    const remainder = target.slice(base.length + 1);
    return remainder.length > 0 && !remainder.includes("/");
  }

  return target === glob;
}
17888
/**
 * Reports whether a URL path matches at least one of the given patterns.
 *
 * @param {string} url - URL path to test.
 * @param {Iterable<string>} patterns - Patterns understood by matchUrlPattern.
 * @returns {boolean} True on the first matching pattern, false when none match.
 */
function matchUrlPatterns(url, patterns) {
  return [...patterns].some((candidate) => matchUrlPattern(url, candidate));
}
17894
+
17858
17895
  // src/search/ranking.ts
17859
17896
  function nonNegativeOrZero(value) {
17860
17897
  if (!Number.isFinite(value)) {
@@ -17883,21 +17920,11 @@ function rankHits(hits, config) {
17883
17920
  });
17884
17921
  }
17885
17922
  function findPageWeight(url, pageWeights) {
17886
- const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
17887
- const normalizedUrl = norm(url);
17888
- for (const [pattern, weight] of Object.entries(pageWeights)) {
17889
- if (norm(pattern) === normalizedUrl) {
17890
- return weight;
17891
- }
17892
- }
17893
- let bestPrefix = "";
17923
+ let bestPattern = "";
17894
17924
  let bestWeight = 1;
17895
17925
  for (const [pattern, weight] of Object.entries(pageWeights)) {
17896
- const normalizedPattern = norm(pattern);
17897
- if (normalizedPattern === "/") continue;
17898
- const prefix = `${normalizedPattern}/`;
17899
- if (normalizedUrl.startsWith(prefix) && prefix.length > bestPrefix.length) {
17900
- bestPrefix = prefix;
17926
+ if (matchUrlPattern(url, pattern) && pattern.length > bestPattern.length) {
17927
+ bestPattern = pattern;
17901
17928
  bestWeight = weight;
17902
17929
  }
17903
17930
  }
@@ -17955,7 +17982,8 @@ var requestSchema = z.object({
17955
17982
  pathPrefix: z.string().optional(),
17956
17983
  tags: z.array(z.string()).optional(),
17957
17984
  rerank: z.boolean().optional(),
17958
- groupBy: z.enum(["page", "chunk"]).optional()
17985
+ groupBy: z.enum(["page", "chunk"]).optional(),
17986
+ stream: z.boolean().optional()
17959
17987
  });
17960
17988
  var SearchEngine = class _SearchEngine {
17961
17989
  cwd;
@@ -18028,7 +18056,103 @@ var SearchEngine = class _SearchEngine {
18028
18056
  rerankMs = hrTimeMs(rerankStart);
18029
18057
  usedRerank = true;
18030
18058
  }
18031
- let results;
18059
+ const results = this.buildResults(ordered, topK, groupByPage);
18060
+ return {
18061
+ q: input.q,
18062
+ scope: resolvedScope.scopeName,
18063
+ results,
18064
+ meta: {
18065
+ timingsMs: {
18066
+ embed: Math.round(embedMs),
18067
+ vector: Math.round(vectorMs),
18068
+ rerank: Math.round(rerankMs),
18069
+ total: Math.round(hrTimeMs(totalStart))
18070
+ },
18071
+ usedRerank,
18072
+ modelId: this.config.embeddings.model
18073
+ }
18074
+ };
18075
+ }
18076
+ async *searchStreaming(request) {
18077
+ const parsed = requestSchema.safeParse(request);
18078
+ if (!parsed.success) {
18079
+ throw new SearchSocketError("INVALID_REQUEST", parsed.error.issues[0]?.message ?? "Invalid request", 400);
18080
+ }
18081
+ const input = parsed.data;
18082
+ const wantsRerank = Boolean(input.rerank);
18083
+ if (!wantsRerank) {
18084
+ const response = await this.search(request);
18085
+ yield { phase: "initial", data: response };
18086
+ return;
18087
+ }
18088
+ const totalStart = process.hrtime.bigint();
18089
+ const resolvedScope = resolveScope(this.config, input.scope);
18090
+ await this.assertModelCompatibility(resolvedScope);
18091
+ const topK = input.topK ?? 10;
18092
+ const groupByPage = (input.groupBy ?? "page") === "page";
18093
+ const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
18094
+ const embedStart = process.hrtime.bigint();
18095
+ const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
18096
+ const queryVector = queryEmbeddings[0];
18097
+ if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
18098
+ throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
18099
+ }
18100
+ const embedMs = hrTimeMs(embedStart);
18101
+ const vectorStart = process.hrtime.bigint();
18102
+ const hits = await this.vectorStore.query(
18103
+ queryVector,
18104
+ {
18105
+ topK: candidateK,
18106
+ pathPrefix: input.pathPrefix,
18107
+ tags: input.tags
18108
+ },
18109
+ resolvedScope
18110
+ );
18111
+ const vectorMs = hrTimeMs(vectorStart);
18112
+ const ranked = rankHits(hits, this.config);
18113
+ const initialResults = this.buildResults(ranked, topK, groupByPage);
18114
+ yield {
18115
+ phase: "initial",
18116
+ data: {
18117
+ q: input.q,
18118
+ scope: resolvedScope.scopeName,
18119
+ results: initialResults,
18120
+ meta: {
18121
+ timingsMs: {
18122
+ embed: Math.round(embedMs),
18123
+ vector: Math.round(vectorMs),
18124
+ rerank: 0,
18125
+ total: Math.round(hrTimeMs(totalStart))
18126
+ },
18127
+ usedRerank: false,
18128
+ modelId: this.config.embeddings.model
18129
+ }
18130
+ }
18131
+ };
18132
+ const rerankStart = process.hrtime.bigint();
18133
+ const reranked = await this.rerankHits(input.q, ranked, topK);
18134
+ const rerankMs = hrTimeMs(rerankStart);
18135
+ const rerankedResults = this.buildResults(reranked, topK, groupByPage);
18136
+ yield {
18137
+ phase: "reranked",
18138
+ data: {
18139
+ q: input.q,
18140
+ scope: resolvedScope.scopeName,
18141
+ results: rerankedResults,
18142
+ meta: {
18143
+ timingsMs: {
18144
+ embed: Math.round(embedMs),
18145
+ vector: Math.round(vectorMs),
18146
+ rerank: Math.round(rerankMs),
18147
+ total: Math.round(hrTimeMs(totalStart))
18148
+ },
18149
+ usedRerank: true,
18150
+ modelId: this.config.embeddings.model
18151
+ }
18152
+ }
18153
+ };
18154
+ }
18155
+ buildResults(ordered, topK, groupByPage) {
18032
18156
  const minScore = this.config.ranking.minScore;
18033
18157
  if (groupByPage) {
18034
18158
  let pages = aggregateByPage(ordered, this.config);
@@ -18036,10 +18160,10 @@ var SearchEngine = class _SearchEngine {
18036
18160
  pages = pages.filter((p) => p.pageScore >= minScore);
18037
18161
  }
18038
18162
  const minRatio = this.config.ranking.minChunkScoreRatio;
18039
- results = pages.slice(0, topK).map((page) => {
18163
+ return pages.slice(0, topK).map((page) => {
18040
18164
  const bestScore = page.bestChunk.finalScore;
18041
- const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
18042
- const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
18165
+ const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
18166
+ const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
18043
18167
  return {
18044
18168
  url: page.url,
18045
18169
  title: page.title,
@@ -18056,10 +18180,11 @@ var SearchEngine = class _SearchEngine {
18056
18180
  };
18057
18181
  });
18058
18182
  } else {
18183
+ let filtered = ordered;
18059
18184
  if (minScore > 0) {
18060
- ordered = ordered.filter((entry) => entry.finalScore >= minScore);
18185
+ filtered = ordered.filter((entry) => entry.finalScore >= minScore);
18061
18186
  }
18062
- results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
18187
+ return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
18063
18188
  url: hit.metadata.url,
18064
18189
  title: hit.metadata.title,
18065
18190
  sectionTitle: hit.metadata.sectionTitle || void 0,
@@ -18068,21 +18193,6 @@ var SearchEngine = class _SearchEngine {
18068
18193
  routeFile: hit.metadata.routeFile
18069
18194
  }));
18070
18195
  }
18071
- return {
18072
- q: input.q,
18073
- scope: resolvedScope.scopeName,
18074
- results,
18075
- meta: {
18076
- timingsMs: {
18077
- embed: Math.round(embedMs),
18078
- vector: Math.round(vectorMs),
18079
- rerank: Math.round(rerankMs),
18080
- total: Math.round(hrTimeMs(totalStart))
18081
- },
18082
- usedRerank,
18083
- modelId: this.config.embeddings.model
18084
- }
18085
- };
18086
18196
  }
18087
18197
  async getPage(pathOrUrl, scope) {
18088
18198
  const resolvedScope = resolveScope(this.config, scope);
@@ -18154,6 +18264,7 @@ var SearchEngine = class _SearchEngine {
18154
18264
  const MAX_CHUNKS_PER_PAGE = 5;
18155
18265
  const MIN_CHUNKS_PER_PAGE = 1;
18156
18266
  const MIN_CHUNK_SCORE_RATIO = 0.5;
18267
+ const MAX_DOC_CHARS = 2e3;
18157
18268
  const pageCandidates = [];
18158
18269
  for (const [url, chunks] of pageGroups) {
18159
18270
  const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
@@ -18173,12 +18284,18 @@ var SearchEngine = class _SearchEngine {
18173
18284
  }
18174
18285
  const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
18175
18286
  parts.push(body);
18176
- pageCandidates.push({ id: url, text: parts.join("\n\n") });
18287
+ let text = parts.join("\n\n");
18288
+ if (text.length > MAX_DOC_CHARS) {
18289
+ text = text.slice(0, MAX_DOC_CHARS);
18290
+ }
18291
+ pageCandidates.push({ id: url, text });
18177
18292
  }
18293
+ const maxCandidates = Math.max(topK, this.config.rerank.topN);
18294
+ const cappedCandidates = pageCandidates.slice(0, maxCandidates);
18178
18295
  const reranked = await this.reranker.rerank(
18179
18296
  query,
18180
- pageCandidates,
18181
- Math.max(topK, this.config.rerank.topN)
18297
+ cappedCandidates,
18298
+ maxCandidates
18182
18299
  );
18183
18300
  const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
18184
18301
  return ranked.map((entry) => {
@@ -18350,7 +18467,44 @@ function searchsocketHandle(options = {}) {
18350
18467
  throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
18351
18468
  }
18352
18469
  const engine = await getEngine();
18353
- const result = await engine.search(body);
18470
+ const searchRequest = body;
18471
+ if (searchRequest.stream && searchRequest.rerank) {
18472
+ const encoder = new TextEncoder();
18473
+ const stream = new ReadableStream({
18474
+ async start(controller) {
18475
+ try {
18476
+ for await (const event2 of engine.searchStreaming(searchRequest)) {
18477
+ const line = JSON.stringify(event2) + "\n";
18478
+ controller.enqueue(encoder.encode(line));
18479
+ }
18480
+ } catch (streamError) {
18481
+ const errorEvent = {
18482
+ phase: "error",
18483
+ data: {
18484
+ error: {
18485
+ code: streamError instanceof SearchSocketError ? streamError.code : "INTERNAL_ERROR",
18486
+ message: streamError instanceof Error ? streamError.message : "Unknown error"
18487
+ }
18488
+ }
18489
+ };
18490
+ controller.enqueue(encoder.encode(JSON.stringify(errorEvent) + "\n"));
18491
+ } finally {
18492
+ controller.close();
18493
+ }
18494
+ }
18495
+ });
18496
+ return withCors(
18497
+ new Response(stream, {
18498
+ status: 200,
18499
+ headers: {
18500
+ "content-type": "application/x-ndjson"
18501
+ }
18502
+ }),
18503
+ event.request,
18504
+ config
18505
+ );
18506
+ }
18507
+ const result = await engine.search(searchRequest);
18354
18508
  return withCors(
18355
18509
  new Response(JSON.stringify(result), {
18356
18510
  status: 200,
@@ -19579,6 +19733,17 @@ function extractFromHtml(url, html, config) {
19579
19733
  if ($(`[${config.extract.noindexAttr}]`).length > 0) {
19580
19734
  return null;
19581
19735
  }
19736
+ const weightRaw = $("meta[name='searchsocket-weight']").attr("content")?.trim();
19737
+ let weight;
19738
+ if (weightRaw !== void 0) {
19739
+ const parsed = Number(weightRaw);
19740
+ if (Number.isFinite(parsed) && parsed >= 0) {
19741
+ weight = parsed;
19742
+ }
19743
+ }
19744
+ if (weight === 0) {
19745
+ return null;
19746
+ }
19582
19747
  const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
19583
19748
  const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
19584
19749
  const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
@@ -19634,7 +19799,8 @@ function extractFromHtml(url, html, config) {
19634
19799
  noindex: false,
19635
19800
  tags,
19636
19801
  description,
19637
- keywords
19802
+ keywords,
19803
+ weight
19638
19804
  };
19639
19805
  }
19640
19806
  function extractFromMarkdown(url, markdown, title) {
@@ -19647,6 +19813,14 @@ function extractFromMarkdown(url, markdown, title) {
19647
19813
  if (frontmatter.noindex === true || searchsocketMeta?.noindex === true) {
19648
19814
  return null;
19649
19815
  }
19816
+ let mdWeight;
19817
+ const rawWeight = searchsocketMeta?.weight ?? frontmatter.searchsocketWeight;
19818
+ if (typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0) {
19819
+ mdWeight = rawWeight;
19820
+ }
19821
+ if (mdWeight === 0) {
19822
+ return null;
19823
+ }
19650
19824
  const content = parsed.content;
19651
19825
  const normalized = normalizeMarkdown(content);
19652
19826
  if (!normalizeText(normalized)) {
@@ -19669,7 +19843,8 @@ function extractFromMarkdown(url, markdown, title) {
19669
19843
  noindex: false,
19670
19844
  tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
19671
19845
  description: fmDescription,
19672
- keywords: fmKeywords
19846
+ keywords: fmKeywords,
19847
+ weight: mdWeight
19673
19848
  };
19674
19849
  }
19675
19850
  function yamlString(value) {
@@ -19938,15 +20113,7 @@ function expandDynamicUrl(url, value) {
19938
20113
  return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, value);
19939
20114
  }
19940
20115
  function isExcluded(url, patterns) {
19941
- for (const pattern of patterns) {
19942
- if (pattern.endsWith("/*")) {
19943
- const prefix = pattern.slice(0, -1);
19944
- if (url.startsWith(prefix) || url === prefix.slice(0, -1)) return true;
19945
- } else if (url === pattern) {
19946
- return true;
19947
- }
19948
- }
19949
- return false;
20116
+ return matchUrlPatterns(url, patterns);
19950
20117
  }
19951
20118
  function findFreePort() {
19952
20119
  return new Promise((resolve, reject) => {
@@ -20362,12 +20529,83 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
20362
20529
  }
20363
20530
  return pages;
20364
20531
  }
20532
/**
 * Parses robots.txt content and returns the rule set that applies to the
 * given crawler.
 *
 * Grouping follows the Robots Exclusion Protocol: consecutive `User-agent`
 * lines share one group, and a `User-agent` line that comes after any
 * `Allow`/`Disallow` line opens a new group. Directives other than
 * `User-agent`, `Allow` and `Disallow` (for example `Sitemap` or
 * `Crawl-delay`) are ignored and do not end the current group. Rule lines
 * with an empty value add no rule.
 *
 * @param {string} content - Raw robots.txt text.
 * @param {string} [userAgent="Searchsocket"] - Crawler name, matched case-insensitively.
 * @returns {{ disallow: string[], allow: string[] }} Rules of the agent's own
 *   group when it has at least one rule, otherwise the `*` group, otherwise
 *   empty rule lists.
 */
function parseRobotsTxt(content, userAgent = "Searchsocket") {
  const agentGroups = new Map();
  let currentAgents = [];
  // Set once the current group has seen an allow/disallow line; the next
  // user-agent line then starts a fresh group instead of extending this one.
  let groupHasRules = false;

  for (const rawLine of content.split(/\r?\n/)) {
    const line = rawLine.replace(/#.*$/, "").trim();
    if (!line) continue;
    const colonIdx = line.indexOf(":");
    if (colonIdx === -1) continue;
    const directive = line.slice(0, colonIdx).trim().toLowerCase();
    const value = line.slice(colonIdx + 1).trim();

    if (directive === "user-agent") {
      if (groupHasRules) {
        currentAgents = [];
        groupHasRules = false;
      }
      const agentName = value.toLowerCase();
      currentAgents.push(agentName);
      if (!agentGroups.has(agentName)) {
        agentGroups.set(agentName, { disallow: [], allow: [] });
      }
    } else if (directive === "disallow" || directive === "allow") {
      // Even an empty rule line marks the end of the user-agent run.
      groupHasRules = true;
      if (!value) continue;
      for (const agent of currentAgents) {
        agentGroups.get(agent)[directive].push(value);
      }
    }
    // Any other directive is ignored and must not break the current group.
  }

  const specific = agentGroups.get(userAgent.toLowerCase());
  if (specific && (specific.disallow.length > 0 || specific.allow.length > 0)) {
    return specific;
  }
  return agentGroups.get("*") ?? { disallow: [], allow: [] };
}
20567
/**
 * Decides whether a URL path is blocked by a parsed robots.txt rule set.
 *
 * The longest matching `disallow` pattern is compared with the longest
 * matching `allow` pattern; the path is blocked only when the disallow
 * pattern is strictly longer (ties go to allow). Patterns are matched from
 * the start of the path and may use `*` (any run of characters) and a
 * trailing `$` (end of path). Patterns without those characters behave as
 * plain prefixes.
 *
 * @param {string} urlPath - URL path to check.
 * @param {{ disallow: string[], allow: string[] }} rules3 - Parsed rules.
 * @returns {boolean} True when the path must not be indexed.
 */
function isBlockedByRobots(urlPath, rules3) {
  // Linear glob match without building a RegExp, so odd patterns coming from
  // a remote robots.txt cannot trigger catastrophic backtracking.
  const patternMatches = (pattern) => {
    const anchored = pattern.endsWith("$");
    const parts = (anchored ? pattern.slice(0, -1) : pattern).split("*");
    const head = parts[0];
    if (!urlPath.startsWith(head)) return false;
    if (parts.length === 1) {
      return !anchored || urlPath.length === head.length;
    }
    let cursor = head.length;
    // Leftmost placement of each middle literal leaves the most room for the rest.
    for (const part of parts.slice(1, -1)) {
      const found = urlPath.indexOf(part, cursor);
      if (found === -1) return false;
      cursor = found + part.length;
    }
    const tail = parts[parts.length - 1];
    if (!anchored) {
      return urlPath.indexOf(tail, cursor) !== -1;
    }
    return urlPath.length - tail.length >= cursor && urlPath.endsWith(tail);
  };

  const longestMatch = (patterns) => {
    let best = "";
    for (const pattern of patterns) {
      if (pattern.length > best.length && patternMatches(pattern)) {
        best = pattern;
      }
    }
    return best;
  };

  const longestDisallow = longestMatch(rules3.disallow);
  if (!longestDisallow) return false;
  return longestMatch(rules3.allow).length < longestDisallow.length;
}
20583
/**
 * Reads `robots.txt` from a directory and parses it with parseRobotsTxt.
 *
 * Best-effort: any failure while building the path, reading the file or
 * parsing it results in `null` rather than an exception.
 *
 * @param {string} dir - Directory expected to contain robots.txt.
 * @returns {Promise<object | null>} Parsed rules, or null on any failure.
 */
async function loadRobotsTxtFromDir(dir) {
  let rules = null;
  try {
    const robotsPath = path.join(dir, "robots.txt");
    const text = await fs4.readFile(robotsPath, "utf8");
    rules = parseRobotsTxt(text);
  } catch {
    // Deliberately ignored: a missing or unreadable file means "no rules".
  }
  return rules;
}
20591
/**
 * Downloads `/robots.txt` from the origin of `baseUrl` and parses it with
 * parseRobotsTxt.
 *
 * Best-effort: an invalid URL, a network error, a non-2xx response or a
 * timeout all result in `null` rather than an exception. The request is
 * aborted after `timeoutMs` so an unresponsive server cannot stall the caller.
 *
 * @param {string} baseUrl - Base URL of the site being crawled.
 * @param {number} [timeoutMs=10000] - Upper bound for the request, including reading the body.
 * @returns {Promise<object | null>} Parsed rules, or null on any failure.
 */
async function fetchRobotsTxt(baseUrl, timeoutMs = 1e4) {
  try {
    const url = new URL("/robots.txt", baseUrl).href;
    const response = await fetch(url, { signal: AbortSignal.timeout(timeoutMs) });
    if (!response.ok) return null;
    const content = await response.text();
    return parseRobotsTxt(content);
  } catch {
    // Deliberately ignored: an unreachable robots.txt means "no rules".
    return null;
  }
}
20365
20602
 
20366
20603
  // src/indexing/pipeline.ts
20367
20604
  var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
20368
- "jina-embeddings-v3": 2e-5
20605
+ "jina-embeddings-v3": 2e-5,
20606
+ "jina-embeddings-v5-text-small": 5e-5
20369
20607
  };
20370
- var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
20608
+ var DEFAULT_EMBEDDING_PRICE_PER_1K = 5e-5;
20371
20609
  var IndexPipeline = class _IndexPipeline {
20372
20610
  cwd;
20373
20611
  config;
@@ -20445,6 +20683,53 @@ var IndexPipeline = class _IndexPipeline {
20445
20683
  }
20446
20684
  stageEnd("source", sourceStart);
20447
20685
  this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
20686
+ const filterStart = stageStart();
20687
+ let filteredSourcePages = sourcePages;
20688
+ if (this.config.exclude.length > 0) {
20689
+ const beforeExclude = filteredSourcePages.length;
20690
+ filteredSourcePages = filteredSourcePages.filter((p) => {
20691
+ const url = normalizeUrlPath(p.url);
20692
+ if (matchUrlPatterns(url, this.config.exclude)) {
20693
+ this.logger.debug(`Excluding ${url} (matched exclude pattern)`);
20694
+ return false;
20695
+ }
20696
+ return true;
20697
+ });
20698
+ const excludedCount = beforeExclude - filteredSourcePages.length;
20699
+ if (excludedCount > 0) {
20700
+ this.logger.info(`Excluded ${excludedCount} page${excludedCount === 1 ? "" : "s"} by config exclude patterns`);
20701
+ }
20702
+ }
20703
+ if (this.config.respectRobotsTxt) {
20704
+ let robotsRules = null;
20705
+ if (sourceMode === "static-output") {
20706
+ robotsRules = await loadRobotsTxtFromDir(
20707
+ path.resolve(this.cwd, this.config.source.staticOutputDir)
20708
+ );
20709
+ } else if (sourceMode === "build" && this.config.source.build) {
20710
+ robotsRules = await loadRobotsTxtFromDir(
20711
+ path.resolve(this.cwd, this.config.source.build.outputDir)
20712
+ );
20713
+ } else if (sourceMode === "crawl" && this.config.source.crawl) {
20714
+ robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
20715
+ }
20716
+ if (robotsRules) {
20717
+ const beforeRobots = filteredSourcePages.length;
20718
+ filteredSourcePages = filteredSourcePages.filter((p) => {
20719
+ const url = normalizeUrlPath(p.url);
20720
+ if (isBlockedByRobots(url, robotsRules)) {
20721
+ this.logger.debug(`Excluding ${url} (blocked by robots.txt)`);
20722
+ return false;
20723
+ }
20724
+ return true;
20725
+ });
20726
+ const robotsExcluded = beforeRobots - filteredSourcePages.length;
20727
+ if (robotsExcluded > 0) {
20728
+ this.logger.info(`Excluded ${robotsExcluded} page${robotsExcluded === 1 ? "" : "s"} by robots.txt`);
20729
+ }
20730
+ }
20731
+ }
20732
+ stageEnd("filter", filterStart);
20448
20733
  const routeStart = stageStart();
20449
20734
  const routePatterns = await buildRoutePatterns(this.cwd);
20450
20735
  stageEnd("route_map", routeStart);
@@ -20452,7 +20737,7 @@ var IndexPipeline = class _IndexPipeline {
20452
20737
  const extractStart = stageStart();
20453
20738
  this.logger.info("Extracting content...");
20454
20739
  const extractedPages = [];
20455
- for (const sourcePage of sourcePages) {
20740
+ for (const sourcePage of filteredSourcePages) {
20456
20741
  const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
20457
20742
  if (!extracted) {
20458
20743
  this.logger.warn(
@@ -20478,16 +20763,29 @@ var IndexPipeline = class _IndexPipeline {
20478
20763
  seenUrls.add(page.url);
20479
20764
  uniquePages.push(page);
20480
20765
  }
20766
+ const indexablePages = [];
20767
+ for (const page of uniquePages) {
20768
+ const effectiveWeight = page.weight ?? findPageWeight(page.url, this.config.ranking.pageWeights);
20769
+ if (effectiveWeight === 0) {
20770
+ this.logger.debug(`Excluding ${page.url} (zero weight)`);
20771
+ continue;
20772
+ }
20773
+ indexablePages.push(page);
20774
+ }
20775
+ const zeroWeightCount = uniquePages.length - indexablePages.length;
20776
+ if (zeroWeightCount > 0) {
20777
+ this.logger.info(`Excluded ${zeroWeightCount} page${zeroWeightCount === 1 ? "" : "s"} with zero weight`);
20778
+ }
20481
20779
  stageEnd("extract", extractStart);
20482
- const skippedPages = sourcePages.length - uniquePages.length;
20483
- this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
20780
+ const skippedPages = filteredSourcePages.length - indexablePages.length;
20781
+ this.logger.info(`Extracted ${indexablePages.length} page${indexablePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
20484
20782
  const linkStart = stageStart();
20485
- const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
20783
+ const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
20486
20784
  const incomingLinkCount = /* @__PURE__ */ new Map();
20487
- for (const page of uniquePages) {
20785
+ for (const page of indexablePages) {
20488
20786
  incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
20489
20787
  }
20490
- for (const page of uniquePages) {
20788
+ for (const page of indexablePages) {
20491
20789
  for (const outgoing of page.outgoingLinks) {
20492
20790
  if (!pageSet.has(outgoing)) {
20493
20791
  continue;
@@ -20511,7 +20809,7 @@ var IndexPipeline = class _IndexPipeline {
20511
20809
  });
20512
20810
  }
20513
20811
  }
20514
- for (const page of uniquePages) {
20812
+ for (const page of indexablePages) {
20515
20813
  const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
20516
20814
  if (routeMatch.routeResolution === "best-effort") {
20517
20815
  if (this.config.source.strictRouteMapping) {
@@ -20797,7 +21095,7 @@ function searchsocketVitePlugin(options = {}) {
20797
21095
  });
20798
21096
  const stats = await pipeline.run({
20799
21097
  changedOnly: options.changedOnly ?? true,
20800
- force: options.force ?? false,
21098
+ force: (options.force ?? false) || /^(1|true|yes)$/i.test(process.env.SEARCHSOCKET_FORCE_REINDEX ?? ""),
20801
21099
  dryRun: options.dryRun ?? false,
20802
21100
  scopeOverride: options.scope,
20803
21101
  verbose: options.verbose