searchsocket 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5021,32 +5021,32 @@ var require_URL = __commonJS({
5021
5021
  else
5022
5022
  return basepath.substring(0, lastslash + 1) + refpath;
5023
5023
  }
5024
- function remove_dot_segments(path14) {
5025
- if (!path14) return path14;
5024
+ function remove_dot_segments(path15) {
5025
+ if (!path15) return path15;
5026
5026
  var output = "";
5027
- while (path14.length > 0) {
5028
- if (path14 === "." || path14 === "..") {
5029
- path14 = "";
5027
+ while (path15.length > 0) {
5028
+ if (path15 === "." || path15 === "..") {
5029
+ path15 = "";
5030
5030
  break;
5031
5031
  }
5032
- var twochars = path14.substring(0, 2);
5033
- var threechars = path14.substring(0, 3);
5034
- var fourchars = path14.substring(0, 4);
5032
+ var twochars = path15.substring(0, 2);
5033
+ var threechars = path15.substring(0, 3);
5034
+ var fourchars = path15.substring(0, 4);
5035
5035
  if (threechars === "../") {
5036
- path14 = path14.substring(3);
5036
+ path15 = path15.substring(3);
5037
5037
  } else if (twochars === "./") {
5038
- path14 = path14.substring(2);
5038
+ path15 = path15.substring(2);
5039
5039
  } else if (threechars === "/./") {
5040
- path14 = "/" + path14.substring(3);
5041
- } else if (twochars === "/." && path14.length === 2) {
5042
- path14 = "/";
5043
- } else if (fourchars === "/../" || threechars === "/.." && path14.length === 3) {
5044
- path14 = "/" + path14.substring(4);
5040
+ path15 = "/" + path15.substring(3);
5041
+ } else if (twochars === "/." && path15.length === 2) {
5042
+ path15 = "/";
5043
+ } else if (fourchars === "/../" || threechars === "/.." && path15.length === 3) {
5044
+ path15 = "/" + path15.substring(4);
5045
5045
  output = output.replace(/\/?[^\/]*$/, "");
5046
5046
  } else {
5047
- var segment = path14.match(/(\/?([^\/]*))/)[0];
5047
+ var segment = path15.match(/(\/?([^\/]*))/)[0];
5048
5048
  output += segment;
5049
- path14 = path14.substring(segment.length);
5049
+ path15 = path15.substring(segment.length);
5050
5050
  }
5051
5051
  }
5052
5052
  return output;
@@ -16610,6 +16610,8 @@ var searchSocketConfigSchema = zod.z.object({
16610
16610
  envVar: zod.z.string().min(1).optional(),
16611
16611
  sanitize: zod.z.boolean().optional()
16612
16612
  }).optional(),
16613
+ exclude: zod.z.array(zod.z.string()).optional(),
16614
+ respectRobotsTxt: zod.z.boolean().optional(),
16613
16615
  source: zod.z.object({
16614
16616
  mode: zod.z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
16615
16617
  staticOutputDir: zod.z.string().min(1).optional(),
@@ -16740,6 +16742,8 @@ function createDefaultConfig(projectId) {
16740
16742
  envVar: "SEARCHSOCKET_SCOPE",
16741
16743
  sanitize: true
16742
16744
  },
16745
+ exclude: [],
16746
+ respectRobotsTxt: true,
16743
16747
  source: {
16744
16748
  mode: "static-output",
16745
16749
  staticOutputDir: "build",
@@ -16770,7 +16774,7 @@ function createDefaultConfig(projectId) {
16770
16774
  },
16771
16775
  embeddings: {
16772
16776
  provider: "jina",
16773
- model: "jina-embeddings-v3",
16777
+ model: "jina-embeddings-v5-text-small",
16774
16778
  apiKeyEnv: "JINA_API_KEY",
16775
16779
  batchSize: 64,
16776
16780
  concurrency: 4
@@ -16783,9 +16787,9 @@ function createDefaultConfig(projectId) {
16783
16787
  }
16784
16788
  },
16785
16789
  rerank: {
16786
- enabled: false,
16790
+ enabled: true,
16787
16791
  topN: 20,
16788
- model: "jina-reranker-v2-base-multilingual"
16792
+ model: "jina-reranker-v3"
16789
16793
  },
16790
16794
  ranking: {
16791
16795
  enableIncomingLinkBoost: true,
@@ -16904,6 +16908,8 @@ ${issues}`
16904
16908
  ...defaults.scope,
16905
16909
  ...parsed.scope
16906
16910
  },
16911
+ exclude: parsed.exclude ?? defaults.exclude,
16912
+ respectRobotsTxt: parsed.respectRobotsTxt ?? defaults.respectRobotsTxt,
16907
16913
  source: {
16908
16914
  ...defaults.source,
16909
16915
  ...parsed.source,
@@ -17254,7 +17260,7 @@ var JinaReranker = class {
17254
17260
  constructor(options) {
17255
17261
  this.apiKey = options.apiKey;
17256
17262
  this.model = options.model;
17257
- this.maxRetries = options.maxRetries ?? 4;
17263
+ this.maxRetries = options.maxRetries ?? 2;
17258
17264
  }
17259
17265
  async rerank(query, candidates, topN) {
17260
17266
  if (candidates.length === 0) {
@@ -17264,7 +17270,8 @@ var JinaReranker = class {
17264
17270
  model: this.model,
17265
17271
  query,
17266
17272
  documents: candidates.map((candidate) => candidate.text),
17267
- top_n: topN ?? candidates.length
17273
+ top_n: topN ?? candidates.length,
17274
+ return_documents: false
17268
17275
  };
17269
17276
  let attempt = 0;
17270
17277
  while (attempt <= this.maxRetries) {
@@ -17867,6 +17874,36 @@ async function createVectorStore(config, cwd) {
17867
17874
  });
17868
17875
  }
17869
17876
 
17877
+ // src/utils/pattern.ts
17878
+ function matchUrlPattern(url, pattern) {
17879
+ const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
17880
+ const normalizedUrl = norm(url);
17881
+ const normalizedPattern = norm(pattern);
17882
+ if (normalizedPattern.endsWith("/**")) {
17883
+ const prefix = normalizedPattern.slice(0, -3);
17884
+ if (prefix === "") {
17885
+ return true;
17886
+ }
17887
+ return normalizedUrl === prefix || normalizedUrl.startsWith(prefix + "/");
17888
+ }
17889
+ if (normalizedPattern.endsWith("/*")) {
17890
+ const prefix = normalizedPattern.slice(0, -2);
17891
+ if (prefix === "") {
17892
+ return normalizedUrl !== "/" && !normalizedUrl.slice(1).includes("/");
17893
+ }
17894
+ if (!normalizedUrl.startsWith(prefix + "/")) return false;
17895
+ const rest = normalizedUrl.slice(prefix.length + 1);
17896
+ return rest.length > 0 && !rest.includes("/");
17897
+ }
17898
+ return normalizedUrl === normalizedPattern;
17899
+ }
17900
+ function matchUrlPatterns(url, patterns) {
17901
+ for (const pattern of patterns) {
17902
+ if (matchUrlPattern(url, pattern)) return true;
17903
+ }
17904
+ return false;
17905
+ }
17906
+
17870
17907
  // src/search/ranking.ts
17871
17908
  function nonNegativeOrZero(value) {
17872
17909
  if (!Number.isFinite(value)) {
@@ -17895,21 +17932,11 @@ function rankHits(hits, config) {
17895
17932
  });
17896
17933
  }
17897
17934
  function findPageWeight(url, pageWeights) {
17898
- const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
17899
- const normalizedUrl = norm(url);
17900
- for (const [pattern, weight] of Object.entries(pageWeights)) {
17901
- if (norm(pattern) === normalizedUrl) {
17902
- return weight;
17903
- }
17904
- }
17905
- let bestPrefix = "";
17935
+ let bestPattern = "";
17906
17936
  let bestWeight = 1;
17907
17937
  for (const [pattern, weight] of Object.entries(pageWeights)) {
17908
- const normalizedPattern = norm(pattern);
17909
- if (normalizedPattern === "/") continue;
17910
- const prefix = `${normalizedPattern}/`;
17911
- if (normalizedUrl.startsWith(prefix) && prefix.length > bestPrefix.length) {
17912
- bestPrefix = prefix;
17938
+ if (matchUrlPattern(url, pattern) && pattern.length > bestPattern.length) {
17939
+ bestPattern = pattern;
17913
17940
  bestWeight = weight;
17914
17941
  }
17915
17942
  }
@@ -17967,7 +17994,8 @@ var requestSchema = zod.z.object({
17967
17994
  pathPrefix: zod.z.string().optional(),
17968
17995
  tags: zod.z.array(zod.z.string()).optional(),
17969
17996
  rerank: zod.z.boolean().optional(),
17970
- groupBy: zod.z.enum(["page", "chunk"]).optional()
17997
+ groupBy: zod.z.enum(["page", "chunk"]).optional(),
17998
+ stream: zod.z.boolean().optional()
17971
17999
  });
17972
18000
  var SearchEngine = class _SearchEngine {
17973
18001
  cwd;
@@ -18040,7 +18068,103 @@ var SearchEngine = class _SearchEngine {
18040
18068
  rerankMs = hrTimeMs(rerankStart);
18041
18069
  usedRerank = true;
18042
18070
  }
18043
- let results;
18071
+ const results = this.buildResults(ordered, topK, groupByPage);
18072
+ return {
18073
+ q: input.q,
18074
+ scope: resolvedScope.scopeName,
18075
+ results,
18076
+ meta: {
18077
+ timingsMs: {
18078
+ embed: Math.round(embedMs),
18079
+ vector: Math.round(vectorMs),
18080
+ rerank: Math.round(rerankMs),
18081
+ total: Math.round(hrTimeMs(totalStart))
18082
+ },
18083
+ usedRerank,
18084
+ modelId: this.config.embeddings.model
18085
+ }
18086
+ };
18087
+ }
18088
+ async *searchStreaming(request) {
18089
+ const parsed = requestSchema.safeParse(request);
18090
+ if (!parsed.success) {
18091
+ throw new SearchSocketError("INVALID_REQUEST", parsed.error.issues[0]?.message ?? "Invalid request", 400);
18092
+ }
18093
+ const input = parsed.data;
18094
+ const wantsRerank = Boolean(input.rerank);
18095
+ if (!wantsRerank) {
18096
+ const response = await this.search(request);
18097
+ yield { phase: "initial", data: response };
18098
+ return;
18099
+ }
18100
+ const totalStart = process.hrtime.bigint();
18101
+ const resolvedScope = resolveScope(this.config, input.scope);
18102
+ await this.assertModelCompatibility(resolvedScope);
18103
+ const topK = input.topK ?? 10;
18104
+ const groupByPage = (input.groupBy ?? "page") === "page";
18105
+ const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
18106
+ const embedStart = process.hrtime.bigint();
18107
+ const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
18108
+ const queryVector = queryEmbeddings[0];
18109
+ if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
18110
+ throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
18111
+ }
18112
+ const embedMs = hrTimeMs(embedStart);
18113
+ const vectorStart = process.hrtime.bigint();
18114
+ const hits = await this.vectorStore.query(
18115
+ queryVector,
18116
+ {
18117
+ topK: candidateK,
18118
+ pathPrefix: input.pathPrefix,
18119
+ tags: input.tags
18120
+ },
18121
+ resolvedScope
18122
+ );
18123
+ const vectorMs = hrTimeMs(vectorStart);
18124
+ const ranked = rankHits(hits, this.config);
18125
+ const initialResults = this.buildResults(ranked, topK, groupByPage);
18126
+ yield {
18127
+ phase: "initial",
18128
+ data: {
18129
+ q: input.q,
18130
+ scope: resolvedScope.scopeName,
18131
+ results: initialResults,
18132
+ meta: {
18133
+ timingsMs: {
18134
+ embed: Math.round(embedMs),
18135
+ vector: Math.round(vectorMs),
18136
+ rerank: 0,
18137
+ total: Math.round(hrTimeMs(totalStart))
18138
+ },
18139
+ usedRerank: false,
18140
+ modelId: this.config.embeddings.model
18141
+ }
18142
+ }
18143
+ };
18144
+ const rerankStart = process.hrtime.bigint();
18145
+ const reranked = await this.rerankHits(input.q, ranked, topK);
18146
+ const rerankMs = hrTimeMs(rerankStart);
18147
+ const rerankedResults = this.buildResults(reranked, topK, groupByPage);
18148
+ yield {
18149
+ phase: "reranked",
18150
+ data: {
18151
+ q: input.q,
18152
+ scope: resolvedScope.scopeName,
18153
+ results: rerankedResults,
18154
+ meta: {
18155
+ timingsMs: {
18156
+ embed: Math.round(embedMs),
18157
+ vector: Math.round(vectorMs),
18158
+ rerank: Math.round(rerankMs),
18159
+ total: Math.round(hrTimeMs(totalStart))
18160
+ },
18161
+ usedRerank: true,
18162
+ modelId: this.config.embeddings.model
18163
+ }
18164
+ }
18165
+ };
18166
+ }
18167
+ buildResults(ordered, topK, groupByPage) {
18044
18168
  const minScore = this.config.ranking.minScore;
18045
18169
  if (groupByPage) {
18046
18170
  let pages = aggregateByPage(ordered, this.config);
@@ -18048,10 +18172,10 @@ var SearchEngine = class _SearchEngine {
18048
18172
  pages = pages.filter((p) => p.pageScore >= minScore);
18049
18173
  }
18050
18174
  const minRatio = this.config.ranking.minChunkScoreRatio;
18051
- results = pages.slice(0, topK).map((page) => {
18175
+ return pages.slice(0, topK).map((page) => {
18052
18176
  const bestScore = page.bestChunk.finalScore;
18053
- const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
18054
- const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
18177
+ const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
18178
+ const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
18055
18179
  return {
18056
18180
  url: page.url,
18057
18181
  title: page.title,
@@ -18068,10 +18192,11 @@ var SearchEngine = class _SearchEngine {
18068
18192
  };
18069
18193
  });
18070
18194
  } else {
18195
+ let filtered = ordered;
18071
18196
  if (minScore > 0) {
18072
- ordered = ordered.filter((entry) => entry.finalScore >= minScore);
18197
+ filtered = ordered.filter((entry) => entry.finalScore >= minScore);
18073
18198
  }
18074
- results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
18199
+ return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
18075
18200
  url: hit.metadata.url,
18076
18201
  title: hit.metadata.title,
18077
18202
  sectionTitle: hit.metadata.sectionTitle || void 0,
@@ -18080,21 +18205,6 @@ var SearchEngine = class _SearchEngine {
18080
18205
  routeFile: hit.metadata.routeFile
18081
18206
  }));
18082
18207
  }
18083
- return {
18084
- q: input.q,
18085
- scope: resolvedScope.scopeName,
18086
- results,
18087
- meta: {
18088
- timingsMs: {
18089
- embed: Math.round(embedMs),
18090
- vector: Math.round(vectorMs),
18091
- rerank: Math.round(rerankMs),
18092
- total: Math.round(hrTimeMs(totalStart))
18093
- },
18094
- usedRerank,
18095
- modelId: this.config.embeddings.model
18096
- }
18097
- };
18098
18208
  }
18099
18209
  async getPage(pathOrUrl, scope) {
18100
18210
  const resolvedScope = resolveScope(this.config, scope);
@@ -18166,6 +18276,7 @@ var SearchEngine = class _SearchEngine {
18166
18276
  const MAX_CHUNKS_PER_PAGE = 5;
18167
18277
  const MIN_CHUNKS_PER_PAGE = 1;
18168
18278
  const MIN_CHUNK_SCORE_RATIO = 0.5;
18279
+ const MAX_DOC_CHARS = 2e3;
18169
18280
  const pageCandidates = [];
18170
18281
  for (const [url, chunks] of pageGroups) {
18171
18282
  const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
@@ -18185,12 +18296,18 @@ var SearchEngine = class _SearchEngine {
18185
18296
  }
18186
18297
  const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
18187
18298
  parts.push(body);
18188
- pageCandidates.push({ id: url, text: parts.join("\n\n") });
18299
+ let text = parts.join("\n\n");
18300
+ if (text.length > MAX_DOC_CHARS) {
18301
+ text = text.slice(0, MAX_DOC_CHARS);
18302
+ }
18303
+ pageCandidates.push({ id: url, text });
18189
18304
  }
18305
+ const maxCandidates = Math.max(topK, this.config.rerank.topN);
18306
+ const cappedCandidates = pageCandidates.slice(0, maxCandidates);
18190
18307
  const reranked = await this.reranker.rerank(
18191
18308
  query,
18192
- pageCandidates,
18193
- Math.max(topK, this.config.rerank.topN)
18309
+ cappedCandidates,
18310
+ maxCandidates
18194
18311
  );
18195
18312
  const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
18196
18313
  return ranked.map((entry) => {
@@ -18362,7 +18479,44 @@ function searchsocketHandle(options = {}) {
18362
18479
  throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
18363
18480
  }
18364
18481
  const engine = await getEngine();
18365
- const result = await engine.search(body);
18482
+ const searchRequest = body;
18483
+ if (searchRequest.stream && searchRequest.rerank) {
18484
+ const encoder = new TextEncoder();
18485
+ const stream = new ReadableStream({
18486
+ async start(controller) {
18487
+ try {
18488
+ for await (const event2 of engine.searchStreaming(searchRequest)) {
18489
+ const line = JSON.stringify(event2) + "\n";
18490
+ controller.enqueue(encoder.encode(line));
18491
+ }
18492
+ } catch (streamError) {
18493
+ const errorEvent = {
18494
+ phase: "error",
18495
+ data: {
18496
+ error: {
18497
+ code: streamError instanceof SearchSocketError ? streamError.code : "INTERNAL_ERROR",
18498
+ message: streamError instanceof Error ? streamError.message : "Unknown error"
18499
+ }
18500
+ }
18501
+ };
18502
+ controller.enqueue(encoder.encode(JSON.stringify(errorEvent) + "\n"));
18503
+ } finally {
18504
+ controller.close();
18505
+ }
18506
+ }
18507
+ });
18508
+ return withCors(
18509
+ new Response(stream, {
18510
+ status: 200,
18511
+ headers: {
18512
+ "content-type": "application/x-ndjson"
18513
+ }
18514
+ }),
18515
+ event.request,
18516
+ config
18517
+ );
18518
+ }
18519
+ const result = await engine.search(searchRequest);
18366
18520
  return withCors(
18367
18521
  new Response(JSON.stringify(result), {
18368
18522
  status: 200,
@@ -19591,6 +19745,17 @@ function extractFromHtml(url, html, config) {
19591
19745
  if ($(`[${config.extract.noindexAttr}]`).length > 0) {
19592
19746
  return null;
19593
19747
  }
19748
+ const weightRaw = $("meta[name='searchsocket-weight']").attr("content")?.trim();
19749
+ let weight;
19750
+ if (weightRaw !== void 0) {
19751
+ const parsed = Number(weightRaw);
19752
+ if (Number.isFinite(parsed) && parsed >= 0) {
19753
+ weight = parsed;
19754
+ }
19755
+ }
19756
+ if (weight === 0) {
19757
+ return null;
19758
+ }
19594
19759
  const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
19595
19760
  const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
19596
19761
  const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
@@ -19646,7 +19811,8 @@ function extractFromHtml(url, html, config) {
19646
19811
  noindex: false,
19647
19812
  tags,
19648
19813
  description,
19649
- keywords
19814
+ keywords,
19815
+ weight
19650
19816
  };
19651
19817
  }
19652
19818
  function extractFromMarkdown(url, markdown, title) {
@@ -19659,6 +19825,14 @@ function extractFromMarkdown(url, markdown, title) {
19659
19825
  if (frontmatter.noindex === true || searchsocketMeta?.noindex === true) {
19660
19826
  return null;
19661
19827
  }
19828
+ let mdWeight;
19829
+ const rawWeight = searchsocketMeta?.weight ?? frontmatter.searchsocketWeight;
19830
+ if (typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0) {
19831
+ mdWeight = rawWeight;
19832
+ }
19833
+ if (mdWeight === 0) {
19834
+ return null;
19835
+ }
19662
19836
  const content = parsed.content;
19663
19837
  const normalized = normalizeMarkdown(content);
19664
19838
  if (!normalizeText(normalized)) {
@@ -19681,7 +19855,8 @@ function extractFromMarkdown(url, markdown, title) {
19681
19855
  noindex: false,
19682
19856
  tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
19683
19857
  description: fmDescription,
19684
- keywords: fmKeywords
19858
+ keywords: fmKeywords,
19859
+ weight: mdWeight
19685
19860
  };
19686
19861
  }
19687
19862
  function yamlString(value) {
@@ -19950,15 +20125,7 @@ function expandDynamicUrl(url, value) {
19950
20125
  return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, value);
19951
20126
  }
19952
20127
  function isExcluded(url, patterns) {
19953
- for (const pattern of patterns) {
19954
- if (pattern.endsWith("/*")) {
19955
- const prefix = pattern.slice(0, -1);
19956
- if (url.startsWith(prefix) || url === prefix.slice(0, -1)) return true;
19957
- } else if (url === pattern) {
19958
- return true;
19959
- }
19960
- }
19961
- return false;
20128
+ return matchUrlPatterns(url, patterns);
19962
20129
  }
19963
20130
  function findFreePort() {
19964
20131
  return new Promise((resolve, reject) => {
@@ -20374,12 +20541,83 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
20374
20541
  }
20375
20542
  return pages;
20376
20543
  }
20544
+ function parseRobotsTxt(content, userAgent = "Searchsocket") {
20545
+ const lines = content.split(/\r?\n/);
20546
+ const agentGroups = /* @__PURE__ */ new Map();
20547
+ let currentAgents = [];
20548
+ for (const rawLine of lines) {
20549
+ const line = rawLine.replace(/#.*$/, "").trim();
20550
+ if (!line) continue;
20551
+ const colonIdx = line.indexOf(":");
20552
+ if (colonIdx === -1) continue;
20553
+ const directive = line.slice(0, colonIdx).trim().toLowerCase();
20554
+ const value = line.slice(colonIdx + 1).trim();
20555
+ if (directive === "user-agent") {
20556
+ const agentName = value.toLowerCase();
20557
+ currentAgents.push(agentName);
20558
+ if (!agentGroups.has(agentName)) {
20559
+ agentGroups.set(agentName, { disallow: [], allow: [] });
20560
+ }
20561
+ } else if (directive === "disallow" && value && currentAgents.length > 0) {
20562
+ for (const agent of currentAgents) {
20563
+ agentGroups.get(agent).disallow.push(value);
20564
+ }
20565
+ } else if (directive === "allow" && value && currentAgents.length > 0) {
20566
+ for (const agent of currentAgents) {
20567
+ agentGroups.get(agent).allow.push(value);
20568
+ }
20569
+ } else if (directive !== "disallow" && directive !== "allow") {
20570
+ currentAgents = [];
20571
+ }
20572
+ }
20573
+ const specific = agentGroups.get(userAgent.toLowerCase());
20574
+ if (specific && (specific.disallow.length > 0 || specific.allow.length > 0)) {
20575
+ return specific;
20576
+ }
20577
+ return agentGroups.get("*") ?? { disallow: [], allow: [] };
20578
+ }
20579
+ function isBlockedByRobots(urlPath, rules3) {
20580
+ let longestDisallow = "";
20581
+ for (const pattern of rules3.disallow) {
20582
+ if (urlPath.startsWith(pattern) && pattern.length > longestDisallow.length) {
20583
+ longestDisallow = pattern;
20584
+ }
20585
+ }
20586
+ if (!longestDisallow) return false;
20587
+ let longestAllow = "";
20588
+ for (const pattern of rules3.allow) {
20589
+ if (urlPath.startsWith(pattern) && pattern.length > longestAllow.length) {
20590
+ longestAllow = pattern;
20591
+ }
20592
+ }
20593
+ return longestAllow.length < longestDisallow.length;
20594
+ }
20595
+ async function loadRobotsTxtFromDir(dir) {
20596
+ try {
20597
+ const content = await fs4__default.default.readFile(path__default.default.join(dir, "robots.txt"), "utf8");
20598
+ return parseRobotsTxt(content);
20599
+ } catch {
20600
+ return null;
20601
+ }
20602
+ }
20603
+ async function fetchRobotsTxt(baseUrl) {
20604
+ try {
20605
+ const url = new URL("/robots.txt", baseUrl).href;
20606
+ const response = await fetch(url);
20607
+ if (!response.ok) return null;
20608
+ const content = await response.text();
20609
+ return parseRobotsTxt(content);
20610
+ } catch {
20611
+ return null;
20612
+ }
20613
+ }
20377
20614
 
20378
20615
  // src/indexing/pipeline.ts
20379
20616
  var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
20380
- "jina-embeddings-v3": 2e-5
20617
+ "jina-embeddings-v3": 2e-5,
20618
+ "jina-embeddings-v5-text-small": 5e-5
20381
20619
  };
20382
- var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
20620
+ var DEFAULT_EMBEDDING_PRICE_PER_1K = 5e-5;
20383
20621
  var IndexPipeline = class _IndexPipeline {
20384
20622
  cwd;
20385
20623
  config;
@@ -20457,6 +20695,53 @@ var IndexPipeline = class _IndexPipeline {
20457
20695
  }
20458
20696
  stageEnd("source", sourceStart);
20459
20697
  this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
20698
+ const filterStart = stageStart();
20699
+ let filteredSourcePages = sourcePages;
20700
+ if (this.config.exclude.length > 0) {
20701
+ const beforeExclude = filteredSourcePages.length;
20702
+ filteredSourcePages = filteredSourcePages.filter((p) => {
20703
+ const url = normalizeUrlPath(p.url);
20704
+ if (matchUrlPatterns(url, this.config.exclude)) {
20705
+ this.logger.debug(`Excluding ${url} (matched exclude pattern)`);
20706
+ return false;
20707
+ }
20708
+ return true;
20709
+ });
20710
+ const excludedCount = beforeExclude - filteredSourcePages.length;
20711
+ if (excludedCount > 0) {
20712
+ this.logger.info(`Excluded ${excludedCount} page${excludedCount === 1 ? "" : "s"} by config exclude patterns`);
20713
+ }
20714
+ }
20715
+ if (this.config.respectRobotsTxt) {
20716
+ let robotsRules = null;
20717
+ if (sourceMode === "static-output") {
20718
+ robotsRules = await loadRobotsTxtFromDir(
20719
+ path__default.default.resolve(this.cwd, this.config.source.staticOutputDir)
20720
+ );
20721
+ } else if (sourceMode === "build" && this.config.source.build) {
20722
+ robotsRules = await loadRobotsTxtFromDir(
20723
+ path__default.default.resolve(this.cwd, this.config.source.build.outputDir)
20724
+ );
20725
+ } else if (sourceMode === "crawl" && this.config.source.crawl) {
20726
+ robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
20727
+ }
20728
+ if (robotsRules) {
20729
+ const beforeRobots = filteredSourcePages.length;
20730
+ filteredSourcePages = filteredSourcePages.filter((p) => {
20731
+ const url = normalizeUrlPath(p.url);
20732
+ if (isBlockedByRobots(url, robotsRules)) {
20733
+ this.logger.debug(`Excluding ${url} (blocked by robots.txt)`);
20734
+ return false;
20735
+ }
20736
+ return true;
20737
+ });
20738
+ const robotsExcluded = beforeRobots - filteredSourcePages.length;
20739
+ if (robotsExcluded > 0) {
20740
+ this.logger.info(`Excluded ${robotsExcluded} page${robotsExcluded === 1 ? "" : "s"} by robots.txt`);
20741
+ }
20742
+ }
20743
+ }
20744
+ stageEnd("filter", filterStart);
20460
20745
  const routeStart = stageStart();
20461
20746
  const routePatterns = await buildRoutePatterns(this.cwd);
20462
20747
  stageEnd("route_map", routeStart);
@@ -20464,7 +20749,7 @@ var IndexPipeline = class _IndexPipeline {
20464
20749
  const extractStart = stageStart();
20465
20750
  this.logger.info("Extracting content...");
20466
20751
  const extractedPages = [];
20467
- for (const sourcePage of sourcePages) {
20752
+ for (const sourcePage of filteredSourcePages) {
20468
20753
  const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
20469
20754
  if (!extracted) {
20470
20755
  this.logger.warn(
@@ -20490,16 +20775,29 @@ var IndexPipeline = class _IndexPipeline {
20490
20775
  seenUrls.add(page.url);
20491
20776
  uniquePages.push(page);
20492
20777
  }
20778
+ const indexablePages = [];
20779
+ for (const page of uniquePages) {
20780
+ const effectiveWeight = page.weight ?? findPageWeight(page.url, this.config.ranking.pageWeights);
20781
+ if (effectiveWeight === 0) {
20782
+ this.logger.debug(`Excluding ${page.url} (zero weight)`);
20783
+ continue;
20784
+ }
20785
+ indexablePages.push(page);
20786
+ }
20787
+ const zeroWeightCount = uniquePages.length - indexablePages.length;
20788
+ if (zeroWeightCount > 0) {
20789
+ this.logger.info(`Excluded ${zeroWeightCount} page${zeroWeightCount === 1 ? "" : "s"} with zero weight`);
20790
+ }
20493
20791
  stageEnd("extract", extractStart);
20494
- const skippedPages = sourcePages.length - uniquePages.length;
20495
- this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
20792
+ const skippedPages = filteredSourcePages.length - indexablePages.length;
20793
+ this.logger.info(`Extracted ${indexablePages.length} page${indexablePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
20496
20794
  const linkStart = stageStart();
20497
- const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
20795
+ const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
20498
20796
  const incomingLinkCount = /* @__PURE__ */ new Map();
20499
- for (const page of uniquePages) {
20797
+ for (const page of indexablePages) {
20500
20798
  incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
20501
20799
  }
20502
- for (const page of uniquePages) {
20800
+ for (const page of indexablePages) {
20503
20801
  for (const outgoing of page.outgoingLinks) {
20504
20802
  if (!pageSet.has(outgoing)) {
20505
20803
  continue;
@@ -20523,7 +20821,7 @@ var IndexPipeline = class _IndexPipeline {
20523
20821
  });
20524
20822
  }
20525
20823
  }
20526
- for (const page of uniquePages) {
20824
+ for (const page of indexablePages) {
20527
20825
  const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
20528
20826
  if (routeMatch.routeResolution === "best-effort") {
20529
20827
  if (this.config.source.strictRouteMapping) {
@@ -20809,7 +21107,7 @@ function searchsocketVitePlugin(options = {}) {
20809
21107
  });
20810
21108
  const stats = await pipeline.run({
20811
21109
  changedOnly: options.changedOnly ?? true,
20812
- force: options.force ?? false,
21110
+ force: (options.force ?? false) || /^(1|true|yes)$/i.test(process.env.SEARCHSOCKET_FORCE_REINDEX ?? ""),
20813
21111
  dryRun: options.dryRun ?? false,
20814
21112
  scopeOverride: options.scope,
20815
21113
  verbose: options.verbose
@@ -1,4 +1,4 @@
1
- import { R as ResolvedSearchSocketConfig, b as SearchSocketConfig } from './types-BrG6XTUU.cjs';
1
+ import { R as ResolvedSearchSocketConfig, d as SearchSocketConfig } from './types-z2dw3H6E.cjs';
2
2
 
3
3
  interface SearchSocketHandleOptions {
4
4
  configPath?: string;