searchsocket 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/sveltekit.js CHANGED
@@ -5009,32 +5009,32 @@ var require_URL = __commonJS({
5009
5009
  else
5010
5010
  return basepath.substring(0, lastslash + 1) + refpath;
5011
5011
  }
5012
- function remove_dot_segments(path14) {
5013
- if (!path14) return path14;
5012
+ function remove_dot_segments(path15) {
5013
+ if (!path15) return path15;
5014
5014
  var output = "";
5015
- while (path14.length > 0) {
5016
- if (path14 === "." || path14 === "..") {
5017
- path14 = "";
5015
+ while (path15.length > 0) {
5016
+ if (path15 === "." || path15 === "..") {
5017
+ path15 = "";
5018
5018
  break;
5019
5019
  }
5020
- var twochars = path14.substring(0, 2);
5021
- var threechars = path14.substring(0, 3);
5022
- var fourchars = path14.substring(0, 4);
5020
+ var twochars = path15.substring(0, 2);
5021
+ var threechars = path15.substring(0, 3);
5022
+ var fourchars = path15.substring(0, 4);
5023
5023
  if (threechars === "../") {
5024
- path14 = path14.substring(3);
5024
+ path15 = path15.substring(3);
5025
5025
  } else if (twochars === "./") {
5026
- path14 = path14.substring(2);
5026
+ path15 = path15.substring(2);
5027
5027
  } else if (threechars === "/./") {
5028
- path14 = "/" + path14.substring(3);
5029
- } else if (twochars === "/." && path14.length === 2) {
5030
- path14 = "/";
5031
- } else if (fourchars === "/../" || threechars === "/.." && path14.length === 3) {
5032
- path14 = "/" + path14.substring(4);
5028
+ path15 = "/" + path15.substring(3);
5029
+ } else if (twochars === "/." && path15.length === 2) {
5030
+ path15 = "/";
5031
+ } else if (fourchars === "/../" || threechars === "/.." && path15.length === 3) {
5032
+ path15 = "/" + path15.substring(4);
5033
5033
  output = output.replace(/\/?[^\/]*$/, "");
5034
5034
  } else {
5035
- var segment = path14.match(/(\/?([^\/]*))/)[0];
5035
+ var segment = path15.match(/(\/?([^\/]*))/)[0];
5036
5036
  output += segment;
5037
- path14 = path14.substring(segment.length);
5037
+ path15 = path15.substring(segment.length);
5038
5038
  }
5039
5039
  }
5040
5040
  return output;
@@ -16598,6 +16598,8 @@ var searchSocketConfigSchema = z.object({
16598
16598
  envVar: z.string().min(1).optional(),
16599
16599
  sanitize: z.boolean().optional()
16600
16600
  }).optional(),
16601
+ exclude: z.array(z.string()).optional(),
16602
+ respectRobotsTxt: z.boolean().optional(),
16601
16603
  source: z.object({
16602
16604
  mode: z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
16603
16605
  staticOutputDir: z.string().min(1).optional(),
@@ -16728,6 +16730,8 @@ function createDefaultConfig(projectId) {
16728
16730
  envVar: "SEARCHSOCKET_SCOPE",
16729
16731
  sanitize: true
16730
16732
  },
16733
+ exclude: [],
16734
+ respectRobotsTxt: true,
16731
16735
  source: {
16732
16736
  mode: "static-output",
16733
16737
  staticOutputDir: "build",
@@ -16758,7 +16762,7 @@ function createDefaultConfig(projectId) {
16758
16762
  },
16759
16763
  embeddings: {
16760
16764
  provider: "jina",
16761
- model: "jina-embeddings-v3",
16765
+ model: "jina-embeddings-v5-text-small",
16762
16766
  apiKeyEnv: "JINA_API_KEY",
16763
16767
  batchSize: 64,
16764
16768
  concurrency: 4
@@ -16771,9 +16775,9 @@ function createDefaultConfig(projectId) {
16771
16775
  }
16772
16776
  },
16773
16777
  rerank: {
16774
- enabled: false,
16778
+ enabled: true,
16775
16779
  topN: 20,
16776
- model: "jina-reranker-v2-base-multilingual"
16780
+ model: "jina-reranker-v3"
16777
16781
  },
16778
16782
  ranking: {
16779
16783
  enableIncomingLinkBoost: true,
@@ -16892,6 +16896,8 @@ ${issues}`
16892
16896
  ...defaults.scope,
16893
16897
  ...parsed.scope
16894
16898
  },
16899
+ exclude: parsed.exclude ?? defaults.exclude,
16900
+ respectRobotsTxt: parsed.respectRobotsTxt ?? defaults.respectRobotsTxt,
16895
16901
  source: {
16896
16902
  ...defaults.source,
16897
16903
  ...parsed.source,
@@ -17856,6 +17862,36 @@ async function createVectorStore(config, cwd) {
17856
17862
  });
17857
17863
  }
17858
17864
 
17865
+ // src/utils/pattern.ts
17866
// Test a URL path against a single pattern.
//
// One trailing "/" is stripped from both arguments first (a bare "/" is kept
// as is). Three pattern forms are recognised:
//   - ending in "/" + "**": matches the prefix itself and everything below
//     it; with an empty prefix it matches every URL.
//   - ending in "/" + "*": matches only direct children of the prefix, i.e.
//     exactly one additional non-empty segment.
//   - anything else: the URL must equal the pattern.
//
// @param {string} url      URL path to test.
// @param {string} pattern  Pattern to test the URL against.
// @returns {boolean} true when the URL matches the pattern.
function matchUrlPattern(url, pattern) {
  function dropTrailingSlash(value) {
    if (value === "/" || !value.endsWith("/")) {
      return value;
    }
    return value.slice(0, -1);
  }

  const target = dropTrailingSlash(url);
  const glob = dropTrailingSlash(pattern);

  // Deep wildcard: the prefix and any descendant.
  if (glob.endsWith("/**")) {
    const base = glob.slice(0, -3);
    return base === "" || target === base || target.startsWith(`${base}/`);
  }

  // Single-level wildcard: direct children only.
  if (glob.endsWith("/*")) {
    const base = glob.slice(0, -2);
    if (base === "") {
      // Root-level children: anything but "/" itself with no further slash.
      if (target === "/") {
        return false;
      }
      return !target.slice(1).includes("/");
    }
    const childPrefix = `${base}/`;
    if (!target.startsWith(childPrefix)) {
      return false;
    }
    const child = target.slice(childPrefix.length);
    return child.length > 0 && !child.includes("/");
  }

  // No wildcard: exact comparison.
  return target === glob;
}
17888
/**
 * Report whether a URL matches at least one pattern in a list.
 *
 * @param {string} url URL path to test.
 * @param {Iterable<string>} patterns Patterns handed one by one to matchUrlPattern.
 * @returns {boolean} true if any pattern matches; false for an empty list.
 */
function matchUrlPatterns(url, patterns) {
  return [...patterns].some((candidate) => matchUrlPattern(url, candidate));
}
17894
+
17859
17895
  // src/search/ranking.ts
17860
17896
  function nonNegativeOrZero(value) {
17861
17897
  if (!Number.isFinite(value)) {
@@ -17884,21 +17920,11 @@ function rankHits(hits, config) {
17884
17920
  });
17885
17921
  }
17886
17922
  function findPageWeight(url, pageWeights) {
17887
- const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
17888
- const normalizedUrl = norm(url);
17889
- for (const [pattern, weight] of Object.entries(pageWeights)) {
17890
- if (norm(pattern) === normalizedUrl) {
17891
- return weight;
17892
- }
17893
- }
17894
- let bestPrefix = "";
17923
+ let bestPattern = "";
17895
17924
  let bestWeight = 1;
17896
17925
  for (const [pattern, weight] of Object.entries(pageWeights)) {
17897
- const normalizedPattern = norm(pattern);
17898
- if (normalizedPattern === "/") continue;
17899
- const prefix = `${normalizedPattern}/`;
17900
- if (normalizedUrl.startsWith(prefix) && prefix.length > bestPrefix.length) {
17901
- bestPrefix = prefix;
17926
+ if (matchUrlPattern(url, pattern) && pattern.length > bestPattern.length) {
17927
+ bestPattern = pattern;
17902
17928
  bestWeight = weight;
17903
17929
  }
17904
17930
  }
@@ -17956,7 +17982,8 @@ var requestSchema = z.object({
17956
17982
  pathPrefix: z.string().optional(),
17957
17983
  tags: z.array(z.string()).optional(),
17958
17984
  rerank: z.boolean().optional(),
17959
- groupBy: z.enum(["page", "chunk"]).optional()
17985
+ groupBy: z.enum(["page", "chunk"]).optional(),
17986
+ stream: z.boolean().optional()
17960
17987
  });
17961
17988
  var SearchEngine = class _SearchEngine {
17962
17989
  cwd;
@@ -18029,7 +18056,103 @@ var SearchEngine = class _SearchEngine {
18029
18056
  rerankMs = hrTimeMs(rerankStart);
18030
18057
  usedRerank = true;
18031
18058
  }
18032
- let results;
18059
+ const results = this.buildResults(ordered, topK, groupByPage);
18060
+ return {
18061
+ q: input.q,
18062
+ scope: resolvedScope.scopeName,
18063
+ results,
18064
+ meta: {
18065
+ timingsMs: {
18066
+ embed: Math.round(embedMs),
18067
+ vector: Math.round(vectorMs),
18068
+ rerank: Math.round(rerankMs),
18069
+ total: Math.round(hrTimeMs(totalStart))
18070
+ },
18071
+ usedRerank,
18072
+ modelId: this.config.embeddings.model
18073
+ }
18074
+ };
18075
+ }
18076
+ async *searchStreaming(request) {
18077
+ const parsed = requestSchema.safeParse(request);
18078
+ if (!parsed.success) {
18079
+ throw new SearchSocketError("INVALID_REQUEST", parsed.error.issues[0]?.message ?? "Invalid request", 400);
18080
+ }
18081
+ const input = parsed.data;
18082
+ const wantsRerank = Boolean(input.rerank);
18083
+ if (!wantsRerank) {
18084
+ const response = await this.search(request);
18085
+ yield { phase: "initial", data: response };
18086
+ return;
18087
+ }
18088
+ const totalStart = process.hrtime.bigint();
18089
+ const resolvedScope = resolveScope(this.config, input.scope);
18090
+ await this.assertModelCompatibility(resolvedScope);
18091
+ const topK = input.topK ?? 10;
18092
+ const groupByPage = (input.groupBy ?? "page") === "page";
18093
+ const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
18094
+ const embedStart = process.hrtime.bigint();
18095
+ const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
18096
+ const queryVector = queryEmbeddings[0];
18097
+ if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
18098
+ throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
18099
+ }
18100
+ const embedMs = hrTimeMs(embedStart);
18101
+ const vectorStart = process.hrtime.bigint();
18102
+ const hits = await this.vectorStore.query(
18103
+ queryVector,
18104
+ {
18105
+ topK: candidateK,
18106
+ pathPrefix: input.pathPrefix,
18107
+ tags: input.tags
18108
+ },
18109
+ resolvedScope
18110
+ );
18111
+ const vectorMs = hrTimeMs(vectorStart);
18112
+ const ranked = rankHits(hits, this.config);
18113
+ const initialResults = this.buildResults(ranked, topK, groupByPage);
18114
+ yield {
18115
+ phase: "initial",
18116
+ data: {
18117
+ q: input.q,
18118
+ scope: resolvedScope.scopeName,
18119
+ results: initialResults,
18120
+ meta: {
18121
+ timingsMs: {
18122
+ embed: Math.round(embedMs),
18123
+ vector: Math.round(vectorMs),
18124
+ rerank: 0,
18125
+ total: Math.round(hrTimeMs(totalStart))
18126
+ },
18127
+ usedRerank: false,
18128
+ modelId: this.config.embeddings.model
18129
+ }
18130
+ }
18131
+ };
18132
+ const rerankStart = process.hrtime.bigint();
18133
+ const reranked = await this.rerankHits(input.q, ranked, topK);
18134
+ const rerankMs = hrTimeMs(rerankStart);
18135
+ const rerankedResults = this.buildResults(reranked, topK, groupByPage);
18136
+ yield {
18137
+ phase: "reranked",
18138
+ data: {
18139
+ q: input.q,
18140
+ scope: resolvedScope.scopeName,
18141
+ results: rerankedResults,
18142
+ meta: {
18143
+ timingsMs: {
18144
+ embed: Math.round(embedMs),
18145
+ vector: Math.round(vectorMs),
18146
+ rerank: Math.round(rerankMs),
18147
+ total: Math.round(hrTimeMs(totalStart))
18148
+ },
18149
+ usedRerank: true,
18150
+ modelId: this.config.embeddings.model
18151
+ }
18152
+ }
18153
+ };
18154
+ }
18155
+ buildResults(ordered, topK, groupByPage) {
18033
18156
  const minScore = this.config.ranking.minScore;
18034
18157
  if (groupByPage) {
18035
18158
  let pages = aggregateByPage(ordered, this.config);
@@ -18037,10 +18160,10 @@ var SearchEngine = class _SearchEngine {
18037
18160
  pages = pages.filter((p) => p.pageScore >= minScore);
18038
18161
  }
18039
18162
  const minRatio = this.config.ranking.minChunkScoreRatio;
18040
- results = pages.slice(0, topK).map((page) => {
18163
+ return pages.slice(0, topK).map((page) => {
18041
18164
  const bestScore = page.bestChunk.finalScore;
18042
- const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
18043
- const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
18165
+ const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
18166
+ const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
18044
18167
  return {
18045
18168
  url: page.url,
18046
18169
  title: page.title,
@@ -18057,10 +18180,11 @@ var SearchEngine = class _SearchEngine {
18057
18180
  };
18058
18181
  });
18059
18182
  } else {
18183
+ let filtered = ordered;
18060
18184
  if (minScore > 0) {
18061
- ordered = ordered.filter((entry) => entry.finalScore >= minScore);
18185
+ filtered = ordered.filter((entry) => entry.finalScore >= minScore);
18062
18186
  }
18063
- results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
18187
+ return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
18064
18188
  url: hit.metadata.url,
18065
18189
  title: hit.metadata.title,
18066
18190
  sectionTitle: hit.metadata.sectionTitle || void 0,
@@ -18069,21 +18193,6 @@ var SearchEngine = class _SearchEngine {
18069
18193
  routeFile: hit.metadata.routeFile
18070
18194
  }));
18071
18195
  }
18072
- return {
18073
- q: input.q,
18074
- scope: resolvedScope.scopeName,
18075
- results,
18076
- meta: {
18077
- timingsMs: {
18078
- embed: Math.round(embedMs),
18079
- vector: Math.round(vectorMs),
18080
- rerank: Math.round(rerankMs),
18081
- total: Math.round(hrTimeMs(totalStart))
18082
- },
18083
- usedRerank,
18084
- modelId: this.config.embeddings.model
18085
- }
18086
- };
18087
18196
  }
18088
18197
  async getPage(pathOrUrl, scope) {
18089
18198
  const resolvedScope = resolveScope(this.config, scope);
@@ -18358,7 +18467,44 @@ function searchsocketHandle(options = {}) {
18358
18467
  throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
18359
18468
  }
18360
18469
  const engine = await getEngine();
18361
- const result = await engine.search(body);
18470
+ const searchRequest = body;
18471
+ if (searchRequest.stream && searchRequest.rerank) {
18472
+ const encoder = new TextEncoder();
18473
+ const stream = new ReadableStream({
18474
+ async start(controller) {
18475
+ try {
18476
+ for await (const event2 of engine.searchStreaming(searchRequest)) {
18477
+ const line = JSON.stringify(event2) + "\n";
18478
+ controller.enqueue(encoder.encode(line));
18479
+ }
18480
+ } catch (streamError) {
18481
+ const errorEvent = {
18482
+ phase: "error",
18483
+ data: {
18484
+ error: {
18485
+ code: streamError instanceof SearchSocketError ? streamError.code : "INTERNAL_ERROR",
18486
+ message: streamError instanceof Error ? streamError.message : "Unknown error"
18487
+ }
18488
+ }
18489
+ };
18490
+ controller.enqueue(encoder.encode(JSON.stringify(errorEvent) + "\n"));
18491
+ } finally {
18492
+ controller.close();
18493
+ }
18494
+ }
18495
+ });
18496
+ return withCors(
18497
+ new Response(stream, {
18498
+ status: 200,
18499
+ headers: {
18500
+ "content-type": "application/x-ndjson"
18501
+ }
18502
+ }),
18503
+ event.request,
18504
+ config
18505
+ );
18506
+ }
18507
+ const result = await engine.search(searchRequest);
18362
18508
  return withCors(
18363
18509
  new Response(JSON.stringify(result), {
18364
18510
  status: 200,
@@ -19587,6 +19733,17 @@ function extractFromHtml(url, html, config) {
19587
19733
  if ($(`[${config.extract.noindexAttr}]`).length > 0) {
19588
19734
  return null;
19589
19735
  }
19736
+ const weightRaw = $("meta[name='searchsocket-weight']").attr("content")?.trim();
19737
+ let weight;
19738
+ if (weightRaw !== void 0) {
19739
+ const parsed = Number(weightRaw);
19740
+ if (Number.isFinite(parsed) && parsed >= 0) {
19741
+ weight = parsed;
19742
+ }
19743
+ }
19744
+ if (weight === 0) {
19745
+ return null;
19746
+ }
19590
19747
  const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
19591
19748
  const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
19592
19749
  const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
@@ -19642,7 +19799,8 @@ function extractFromHtml(url, html, config) {
19642
19799
  noindex: false,
19643
19800
  tags,
19644
19801
  description,
19645
- keywords
19802
+ keywords,
19803
+ weight
19646
19804
  };
19647
19805
  }
19648
19806
  function extractFromMarkdown(url, markdown, title) {
@@ -19655,6 +19813,14 @@ function extractFromMarkdown(url, markdown, title) {
19655
19813
  if (frontmatter.noindex === true || searchsocketMeta?.noindex === true) {
19656
19814
  return null;
19657
19815
  }
19816
+ let mdWeight;
19817
+ const rawWeight = searchsocketMeta?.weight ?? frontmatter.searchsocketWeight;
19818
+ if (typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0) {
19819
+ mdWeight = rawWeight;
19820
+ }
19821
+ if (mdWeight === 0) {
19822
+ return null;
19823
+ }
19658
19824
  const content = parsed.content;
19659
19825
  const normalized = normalizeMarkdown(content);
19660
19826
  if (!normalizeText(normalized)) {
@@ -19677,7 +19843,8 @@ function extractFromMarkdown(url, markdown, title) {
19677
19843
  noindex: false,
19678
19844
  tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
19679
19845
  description: fmDescription,
19680
- keywords: fmKeywords
19846
+ keywords: fmKeywords,
19847
+ weight: mdWeight
19681
19848
  };
19682
19849
  }
19683
19850
  function yamlString(value) {
@@ -19946,15 +20113,7 @@ function expandDynamicUrl(url, value) {
19946
20113
  return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, value);
19947
20114
  }
19948
20115
  function isExcluded(url, patterns) {
19949
- for (const pattern of patterns) {
19950
- if (pattern.endsWith("/*")) {
19951
- const prefix = pattern.slice(0, -1);
19952
- if (url.startsWith(prefix) || url === prefix.slice(0, -1)) return true;
19953
- } else if (url === pattern) {
19954
- return true;
19955
- }
19956
- }
19957
- return false;
20116
+ return matchUrlPatterns(url, patterns);
19958
20117
  }
19959
20118
  function findFreePort() {
19960
20119
  return new Promise((resolve, reject) => {
@@ -20370,12 +20529,83 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
20370
20529
  }
20371
20530
  return pages;
20372
20531
  }
20532
/**
 * Parse robots.txt text and return the rule set that applies to one crawler.
 *
 * Grouping follows the Robots Exclusion Protocol (RFC 9309):
 *   - consecutive User-agent lines open one group shared by all named agents;
 *   - a User-agent line that appears after an Allow/Disallow line starts a
 *     NEW group instead of joining the previous one;
 *   - unrelated directives (Sitemap, Crawl-delay, ...) are ignored and do not
 *     end the current group;
 *   - rules from several groups naming the same agent are merged;
 *   - an empty Allow/Disallow value contributes no rule.
 *
 * Agent names are compared case-insensitively by exact match. If a group for
 * `userAgent` exists it is used even when it holds no rules (which means
 * "everything allowed"); otherwise the "*" group is used, and if that is
 * missing too an empty rule set is returned.
 *
 * @param {string} content Raw robots.txt text.
 * @param {string} [userAgent="Searchsocket"] Crawler name to select rules for.
 * @returns {{ disallow: string[], allow: string[] }} Path patterns for the agent.
 */
function parseRobotsTxt(content, userAgent = "Searchsocket") {
  const agentGroups = new Map();
  let currentAgents = [];
  // True once the open group has seen an Allow/Disallow line; the next
  // User-agent line then begins a fresh group.
  let groupHasRules = false;

  for (const rawLine of content.split(/\r?\n/)) {
    const line = rawLine.replace(/#.*$/, "").trim();
    if (!line) continue;
    const colonIdx = line.indexOf(":");
    if (colonIdx === -1) continue;
    const directive = line.slice(0, colonIdx).trim().toLowerCase();
    const value = line.slice(colonIdx + 1).trim();

    if (directive === "user-agent") {
      if (groupHasRules) {
        currentAgents = [];
        groupHasRules = false;
      }
      const agentName = value.toLowerCase();
      currentAgents.push(agentName);
      if (!agentGroups.has(agentName)) {
        agentGroups.set(agentName, { disallow: [], allow: [] });
      }
    } else if (directive === "disallow" || directive === "allow") {
      // Even an empty rule line closes the run of User-agent lines.
      groupHasRules = true;
      if (!value) continue;
      for (const agent of currentAgents) {
        agentGroups.get(agent)[directive].push(value);
      }
    }
    // Any other directive is intentionally ignored and leaves the group open.
  }

  return agentGroups.get(userAgent.toLowerCase()) ?? agentGroups.get("*") ?? { disallow: [], allow: [] };
}
20567
/**
 * Decide whether a URL path is blocked by a parsed robots.txt rule set.
 *
 * The longest matching Disallow pattern is compared with the longest matching
 * Allow pattern; the path is blocked only when the Disallow pattern is
 * strictly longer, so a tie goes to Allow. With no matching Disallow pattern
 * the path is never blocked.
 *
 * Patterns are matched from the start of the path. In addition to plain
 * prefixes, the two special characters of the Robots Exclusion Protocol are
 * supported: "*" matches any run of characters (including none) and a
 * trailing "$" anchors the pattern to the end of the path.
 *
 * @param {string} urlPath URL path to test.
 * @param {{ disallow: string[], allow: string[] }} rules3 Parsed rule set.
 * @returns {boolean} true when the path must not be indexed.
 */
function isBlockedByRobots(urlPath, rules3) {
  // Match one pattern without building a RegExp, so odd characters in
  // robots.txt can neither break the match nor cause heavy backtracking.
  const matches = (pattern) => {
    const anchored = pattern.endsWith("$");
    const body = anchored ? pattern.slice(0, -1) : pattern;
    const segments = body.split("*");

    // Text before the first "*" must be a literal prefix of the path.
    if (!urlPath.startsWith(segments[0])) return false;
    let cursor = segments[0].length;
    if (segments.length === 1) {
      return !anchored || cursor === urlPath.length;
    }

    for (let i = 1; i < segments.length; i += 1) {
      const segment = segments[i];
      const isLast = i === segments.length - 1;
      if (isLast && anchored) {
        // The final literal must sit at the very end, after earlier matches.
        return urlPath.length - segment.length >= cursor && urlPath.endsWith(segment);
      }
      // Leftmost match keeps the most room for the remaining segments.
      const found = urlPath.indexOf(segment, cursor);
      if (found === -1) return false;
      cursor = found + segment.length;
    }
    return true;
  };

  // Length of the longest pattern in the list that matches (0 when none do).
  const longestMatchLength = (patterns) => {
    let longest = 0;
    for (const pattern of patterns) {
      if (pattern.length > longest && matches(pattern)) {
        longest = pattern.length;
      }
    }
    return longest;
  };

  const disallowLength = longestMatchLength(rules3.disallow);
  if (disallowLength === 0) return false;
  return longestMatchLength(rules3.allow) < disallowLength;
}
20583
/**
 * Read "robots.txt" from a directory and parse it with parseRobotsTxt.
 *
 * Best-effort: any failure (for example the file being absent or unreadable)
 * yields null rather than an error.
 *
 * @param {string} dir Directory expected to contain robots.txt.
 * @returns {Promise<object | null>} Parsed rules, or null when unavailable.
 */
async function loadRobotsTxtFromDir(dir) {
  let rules = null;
  try {
    const robotsFile = path.join(dir, "robots.txt");
    const text = await fs4.readFile(robotsFile, "utf8");
    rules = parseRobotsTxt(text);
  } catch {
    rules = null;
  }
  return rules;
}
20591
/**
 * Download "/robots.txt" relative to a base URL and parse it with
 * parseRobotsTxt.
 *
 * Best-effort: an invalid base URL, a failed request, or a non-OK HTTP status
 * all yield null rather than an error.
 *
 * @param {string} baseUrl Base URL the "/robots.txt" path is resolved against.
 * @returns {Promise<object | null>} Parsed rules, or null when unavailable.
 */
async function fetchRobotsTxt(baseUrl) {
  let rules = null;
  try {
    const robotsUrl = new URL("/robots.txt", baseUrl);
    const response = await fetch(robotsUrl.href);
    if (response.ok) {
      const text = await response.text();
      rules = parseRobotsTxt(text);
    }
  } catch {
    rules = null;
  }
  return rules;
}
20373
20602
 
20374
20603
  // src/indexing/pipeline.ts
20375
20604
  var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
20376
- "jina-embeddings-v3": 2e-5
20605
+ "jina-embeddings-v3": 2e-5,
20606
+ "jina-embeddings-v5-text-small": 5e-5
20377
20607
  };
20378
- var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
20608
+ var DEFAULT_EMBEDDING_PRICE_PER_1K = 5e-5;
20379
20609
  var IndexPipeline = class _IndexPipeline {
20380
20610
  cwd;
20381
20611
  config;
@@ -20453,6 +20683,53 @@ var IndexPipeline = class _IndexPipeline {
20453
20683
  }
20454
20684
  stageEnd("source", sourceStart);
20455
20685
  this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
20686
+ const filterStart = stageStart();
20687
+ let filteredSourcePages = sourcePages;
20688
+ if (this.config.exclude.length > 0) {
20689
+ const beforeExclude = filteredSourcePages.length;
20690
+ filteredSourcePages = filteredSourcePages.filter((p) => {
20691
+ const url = normalizeUrlPath(p.url);
20692
+ if (matchUrlPatterns(url, this.config.exclude)) {
20693
+ this.logger.debug(`Excluding ${url} (matched exclude pattern)`);
20694
+ return false;
20695
+ }
20696
+ return true;
20697
+ });
20698
+ const excludedCount = beforeExclude - filteredSourcePages.length;
20699
+ if (excludedCount > 0) {
20700
+ this.logger.info(`Excluded ${excludedCount} page${excludedCount === 1 ? "" : "s"} by config exclude patterns`);
20701
+ }
20702
+ }
20703
+ if (this.config.respectRobotsTxt) {
20704
+ let robotsRules = null;
20705
+ if (sourceMode === "static-output") {
20706
+ robotsRules = await loadRobotsTxtFromDir(
20707
+ path.resolve(this.cwd, this.config.source.staticOutputDir)
20708
+ );
20709
+ } else if (sourceMode === "build" && this.config.source.build) {
20710
+ robotsRules = await loadRobotsTxtFromDir(
20711
+ path.resolve(this.cwd, this.config.source.build.outputDir)
20712
+ );
20713
+ } else if (sourceMode === "crawl" && this.config.source.crawl) {
20714
+ robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
20715
+ }
20716
+ if (robotsRules) {
20717
+ const beforeRobots = filteredSourcePages.length;
20718
+ filteredSourcePages = filteredSourcePages.filter((p) => {
20719
+ const url = normalizeUrlPath(p.url);
20720
+ if (isBlockedByRobots(url, robotsRules)) {
20721
+ this.logger.debug(`Excluding ${url} (blocked by robots.txt)`);
20722
+ return false;
20723
+ }
20724
+ return true;
20725
+ });
20726
+ const robotsExcluded = beforeRobots - filteredSourcePages.length;
20727
+ if (robotsExcluded > 0) {
20728
+ this.logger.info(`Excluded ${robotsExcluded} page${robotsExcluded === 1 ? "" : "s"} by robots.txt`);
20729
+ }
20730
+ }
20731
+ }
20732
+ stageEnd("filter", filterStart);
20456
20733
  const routeStart = stageStart();
20457
20734
  const routePatterns = await buildRoutePatterns(this.cwd);
20458
20735
  stageEnd("route_map", routeStart);
@@ -20460,7 +20737,7 @@ var IndexPipeline = class _IndexPipeline {
20460
20737
  const extractStart = stageStart();
20461
20738
  this.logger.info("Extracting content...");
20462
20739
  const extractedPages = [];
20463
- for (const sourcePage of sourcePages) {
20740
+ for (const sourcePage of filteredSourcePages) {
20464
20741
  const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
20465
20742
  if (!extracted) {
20466
20743
  this.logger.warn(
@@ -20486,16 +20763,29 @@ var IndexPipeline = class _IndexPipeline {
20486
20763
  seenUrls.add(page.url);
20487
20764
  uniquePages.push(page);
20488
20765
  }
20766
+ const indexablePages = [];
20767
+ for (const page of uniquePages) {
20768
+ const effectiveWeight = page.weight ?? findPageWeight(page.url, this.config.ranking.pageWeights);
20769
+ if (effectiveWeight === 0) {
20770
+ this.logger.debug(`Excluding ${page.url} (zero weight)`);
20771
+ continue;
20772
+ }
20773
+ indexablePages.push(page);
20774
+ }
20775
+ const zeroWeightCount = uniquePages.length - indexablePages.length;
20776
+ if (zeroWeightCount > 0) {
20777
+ this.logger.info(`Excluded ${zeroWeightCount} page${zeroWeightCount === 1 ? "" : "s"} with zero weight`);
20778
+ }
20489
20779
  stageEnd("extract", extractStart);
20490
- const skippedPages = sourcePages.length - uniquePages.length;
20491
- this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
20780
+ const skippedPages = filteredSourcePages.length - indexablePages.length;
20781
+ this.logger.info(`Extracted ${indexablePages.length} page${indexablePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
20492
20782
  const linkStart = stageStart();
20493
- const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
20783
+ const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
20494
20784
  const incomingLinkCount = /* @__PURE__ */ new Map();
20495
- for (const page of uniquePages) {
20785
+ for (const page of indexablePages) {
20496
20786
  incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
20497
20787
  }
20498
- for (const page of uniquePages) {
20788
+ for (const page of indexablePages) {
20499
20789
  for (const outgoing of page.outgoingLinks) {
20500
20790
  if (!pageSet.has(outgoing)) {
20501
20791
  continue;
@@ -20519,7 +20809,7 @@ var IndexPipeline = class _IndexPipeline {
20519
20809
  });
20520
20810
  }
20521
20811
  }
20522
- for (const page of uniquePages) {
20812
+ for (const page of indexablePages) {
20523
20813
  const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
20524
20814
  if (routeMatch.routeResolution === "best-effort") {
20525
20815
  if (this.config.source.strictRouteMapping) {
@@ -20805,7 +21095,7 @@ function searchsocketVitePlugin(options = {}) {
20805
21095
  });
20806
21096
  const stats = await pipeline.run({
20807
21097
  changedOnly: options.changedOnly ?? true,
20808
- force: options.force ?? false,
21098
+ force: (options.force ?? false) || /^(1|true|yes)$/i.test(process.env.SEARCHSOCKET_FORCE_REINDEX ?? ""),
20809
21099
  dryRun: options.dryRun ?? false,
20810
21100
  scopeOverride: options.scope,
20811
21101
  verbose: options.verbose