searchsocket 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5021,32 +5021,32 @@ var require_URL = __commonJS({
5021
5021
  else
5022
5022
  return basepath.substring(0, lastslash + 1) + refpath;
5023
5023
  }
5024
- function remove_dot_segments(path14) {
5025
- if (!path14) return path14;
5024
+ function remove_dot_segments(path15) {
5025
+ if (!path15) return path15;
5026
5026
  var output = "";
5027
- while (path14.length > 0) {
5028
- if (path14 === "." || path14 === "..") {
5029
- path14 = "";
5027
+ while (path15.length > 0) {
5028
+ if (path15 === "." || path15 === "..") {
5029
+ path15 = "";
5030
5030
  break;
5031
5031
  }
5032
- var twochars = path14.substring(0, 2);
5033
- var threechars = path14.substring(0, 3);
5034
- var fourchars = path14.substring(0, 4);
5032
+ var twochars = path15.substring(0, 2);
5033
+ var threechars = path15.substring(0, 3);
5034
+ var fourchars = path15.substring(0, 4);
5035
5035
  if (threechars === "../") {
5036
- path14 = path14.substring(3);
5036
+ path15 = path15.substring(3);
5037
5037
  } else if (twochars === "./") {
5038
- path14 = path14.substring(2);
5038
+ path15 = path15.substring(2);
5039
5039
  } else if (threechars === "/./") {
5040
- path14 = "/" + path14.substring(3);
5041
- } else if (twochars === "/." && path14.length === 2) {
5042
- path14 = "/";
5043
- } else if (fourchars === "/../" || threechars === "/.." && path14.length === 3) {
5044
- path14 = "/" + path14.substring(4);
5040
+ path15 = "/" + path15.substring(3);
5041
+ } else if (twochars === "/." && path15.length === 2) {
5042
+ path15 = "/";
5043
+ } else if (fourchars === "/../" || threechars === "/.." && path15.length === 3) {
5044
+ path15 = "/" + path15.substring(4);
5045
5045
  output = output.replace(/\/?[^\/]*$/, "");
5046
5046
  } else {
5047
- var segment = path14.match(/(\/?([^\/]*))/)[0];
5047
+ var segment = path15.match(/(\/?([^\/]*))/)[0];
5048
5048
  output += segment;
5049
- path14 = path14.substring(segment.length);
5049
+ path15 = path15.substring(segment.length);
5050
5050
  }
5051
5051
  }
5052
5052
  return output;
@@ -16610,6 +16610,8 @@ var searchSocketConfigSchema = zod.z.object({
16610
16610
  envVar: zod.z.string().min(1).optional(),
16611
16611
  sanitize: zod.z.boolean().optional()
16612
16612
  }).optional(),
16613
+ exclude: zod.z.array(zod.z.string()).optional(),
16614
+ respectRobotsTxt: zod.z.boolean().optional(),
16613
16615
  source: zod.z.object({
16614
16616
  mode: zod.z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
16615
16617
  staticOutputDir: zod.z.string().min(1).optional(),
@@ -16740,6 +16742,8 @@ function createDefaultConfig(projectId) {
16740
16742
  envVar: "SEARCHSOCKET_SCOPE",
16741
16743
  sanitize: true
16742
16744
  },
16745
+ exclude: [],
16746
+ respectRobotsTxt: true,
16743
16747
  source: {
16744
16748
  mode: "static-output",
16745
16749
  staticOutputDir: "build",
@@ -16770,7 +16774,7 @@ function createDefaultConfig(projectId) {
16770
16774
  },
16771
16775
  embeddings: {
16772
16776
  provider: "jina",
16773
- model: "jina-embeddings-v3",
16777
+ model: "jina-embeddings-v5-text-small",
16774
16778
  apiKeyEnv: "JINA_API_KEY",
16775
16779
  batchSize: 64,
16776
16780
  concurrency: 4
@@ -16783,9 +16787,9 @@ function createDefaultConfig(projectId) {
16783
16787
  }
16784
16788
  },
16785
16789
  rerank: {
16786
- enabled: false,
16790
+ enabled: true,
16787
16791
  topN: 20,
16788
- model: "jina-reranker-v2-base-multilingual"
16792
+ model: "jina-reranker-v3"
16789
16793
  },
16790
16794
  ranking: {
16791
16795
  enableIncomingLinkBoost: true,
@@ -16904,6 +16908,8 @@ ${issues}`
16904
16908
  ...defaults.scope,
16905
16909
  ...parsed.scope
16906
16910
  },
16911
+ exclude: parsed.exclude ?? defaults.exclude,
16912
+ respectRobotsTxt: parsed.respectRobotsTxt ?? defaults.respectRobotsTxt,
16907
16913
  source: {
16908
16914
  ...defaults.source,
16909
16915
  ...parsed.source,
@@ -17868,6 +17874,36 @@ async function createVectorStore(config, cwd) {
17868
17874
  });
17869
17875
  }
17870
17876
 
17877
+ // src/utils/pattern.ts
17878
+ function matchUrlPattern(url, pattern) { // Tests one URL path against one pattern: exact match, "<prefix>/*" (one segment below prefix) or "<prefix>/**" (prefix itself and everything below it).
17879
+ const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p; // Drops one trailing slash, leaving the root path "/" untouched.
17880
+ const normalizedUrl = norm(url);
17881
+ const normalizedPattern = norm(pattern);
17882
+ if (normalizedPattern.endsWith("/**")) { // Deep wildcard form.
17883
+ const prefix = normalizedPattern.slice(0, -3);
17884
+ if (prefix === "") { // Pattern was "/**": every URL matches.
17885
+ return true;
17886
+ }
17887
+ return normalizedUrl === prefix || normalizedUrl.startsWith(prefix + "/"); // The prefix itself, or any path underneath it.
17888
+ }
17889
+ if (normalizedPattern.endsWith("/*")) { // Single-level wildcard form.
17890
+ const prefix = normalizedPattern.slice(0, -2);
17891
+ if (prefix === "") { // Pattern was "/*": any URL other than "/" that has no "/" after its first character.
17892
+ return normalizedUrl !== "/" && !normalizedUrl.slice(1).includes("/");
17893
+ }
17894
+ if (!normalizedUrl.startsWith(prefix + "/")) return false; // Must live below the prefix; the prefix itself does not match "/*".
17895
+ const rest = normalizedUrl.slice(prefix.length + 1);
17896
+ return rest.length > 0 && !rest.includes("/"); // Exactly one non-empty segment after the prefix.
17897
+ }
17898
+ return normalizedUrl === normalizedPattern; // No wildcard suffix: the normalized strings must be identical.
17899
+ }
17900
+ function matchUrlPatterns(url, patterns) { // Returns true as soon as any pattern in `patterns` matches `url` via matchUrlPattern.
17901
+ for (const pattern of patterns) {
17902
+ if (matchUrlPattern(url, pattern)) return true;
17903
+ }
17904
+ return false; // Also the result for an empty pattern list.
17905
+ }
17906
+
17871
17907
  // src/search/ranking.ts
17872
17908
  function nonNegativeOrZero(value) {
17873
17909
  if (!Number.isFinite(value)) {
@@ -17896,21 +17932,11 @@ function rankHits(hits, config) {
17896
17932
  });
17897
17933
  }
17898
17934
  function findPageWeight(url, pageWeights) {
17899
- const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
17900
- const normalizedUrl = norm(url);
17901
- for (const [pattern, weight] of Object.entries(pageWeights)) {
17902
- if (norm(pattern) === normalizedUrl) {
17903
- return weight;
17904
- }
17905
- }
17906
- let bestPrefix = "";
17935
+ let bestPattern = "";
17907
17936
  let bestWeight = 1;
17908
17937
  for (const [pattern, weight] of Object.entries(pageWeights)) {
17909
- const normalizedPattern = norm(pattern);
17910
- if (normalizedPattern === "/") continue;
17911
- const prefix = `${normalizedPattern}/`;
17912
- if (normalizedUrl.startsWith(prefix) && prefix.length > bestPrefix.length) {
17913
- bestPrefix = prefix;
17938
+ if (matchUrlPattern(url, pattern) && pattern.length > bestPattern.length) {
17939
+ bestPattern = pattern;
17914
17940
  bestWeight = weight;
17915
17941
  }
17916
17942
  }
@@ -17968,7 +17994,8 @@ var requestSchema = zod.z.object({
17968
17994
  pathPrefix: zod.z.string().optional(),
17969
17995
  tags: zod.z.array(zod.z.string()).optional(),
17970
17996
  rerank: zod.z.boolean().optional(),
17971
- groupBy: zod.z.enum(["page", "chunk"]).optional()
17997
+ groupBy: zod.z.enum(["page", "chunk"]).optional(),
17998
+ stream: zod.z.boolean().optional()
17972
17999
  });
17973
18000
  var SearchEngine = class _SearchEngine {
17974
18001
  cwd;
@@ -18041,7 +18068,103 @@ var SearchEngine = class _SearchEngine {
18041
18068
  rerankMs = hrTimeMs(rerankStart);
18042
18069
  usedRerank = true;
18043
18070
  }
18044
- let results;
18071
+ const results = this.buildResults(ordered, topK, groupByPage);
18072
+ return {
18073
+ q: input.q,
18074
+ scope: resolvedScope.scopeName,
18075
+ results,
18076
+ meta: {
18077
+ timingsMs: {
18078
+ embed: Math.round(embedMs),
18079
+ vector: Math.round(vectorMs),
18080
+ rerank: Math.round(rerankMs),
18081
+ total: Math.round(hrTimeMs(totalStart))
18082
+ },
18083
+ usedRerank,
18084
+ modelId: this.config.embeddings.model
18085
+ }
18086
+ };
18087
+ }
18088
+ async *searchStreaming(request) { // Async generator: yields a { phase: "initial" } event and, when the request asks for reranking, a following { phase: "reranked" } event.
18089
+ const parsed = requestSchema.safeParse(request);
18090
+ if (!parsed.success) { // Invalid request: surface the first schema issue as INVALID_REQUEST with status 400.
18091
+ throw new SearchSocketError("INVALID_REQUEST", parsed.error.issues[0]?.message ?? "Invalid request", 400);
18092
+ }
18093
+ const input = parsed.data;
18094
+ const wantsRerank = Boolean(input.rerank);
18095
+ if (!wantsRerank) { // No rerank requested: delegate to search() and emit its response as the only event.
18096
+ const response = await this.search(request);
18097
+ yield { phase: "initial", data: response };
18098
+ return;
18099
+ }
18100
+ const totalStart = process.hrtime.bigint(); // Start of the "total" timing reported in both events.
18101
+ const resolvedScope = resolveScope(this.config, input.scope);
18102
+ await this.assertModelCompatibility(resolvedScope);
18103
+ const topK = input.topK ?? 10; // Defaults to 10 results.
18104
+ const groupByPage = (input.groupBy ?? "page") === "page"; // Grouping defaults to "page".
18105
+ const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK); // Over-fetch candidates: never fewer than 50, and 10x topK when grouping by page.
18106
+ const embedStart = process.hrtime.bigint();
18107
+ const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
18108
+ const queryVector = queryEmbeddings[0];
18109
+ if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) { // Reject a missing, empty or non-finite embedding.
18110
+ throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
18111
+ }
18112
+ const embedMs = hrTimeMs(embedStart);
18113
+ const vectorStart = process.hrtime.bigint();
18114
+ const hits = await this.vectorStore.query(
18115
+ queryVector,
18116
+ {
18117
+ topK: candidateK,
18118
+ pathPrefix: input.pathPrefix,
18119
+ tags: input.tags
18120
+ },
18121
+ resolvedScope
18122
+ );
18123
+ const vectorMs = hrTimeMs(vectorStart);
18124
+ const ranked = rankHits(hits, this.config);
18125
+ const initialResults = this.buildResults(ranked, topK, groupByPage);
18126
+ yield { // First event: results built from the ranked hits before any reranking.
18127
+ phase: "initial",
18128
+ data: {
18129
+ q: input.q,
18130
+ scope: resolvedScope.scopeName,
18131
+ results: initialResults,
18132
+ meta: {
18133
+ timingsMs: {
18134
+ embed: Math.round(embedMs),
18135
+ vector: Math.round(vectorMs),
18136
+ rerank: 0, // Reranking has not run yet at this point.
18137
+ total: Math.round(hrTimeMs(totalStart))
18138
+ },
18139
+ usedRerank: false,
18140
+ modelId: this.config.embeddings.model
18141
+ }
18142
+ }
18143
+ };
18144
+ const rerankStart = process.hrtime.bigint();
18145
+ const reranked = await this.rerankHits(input.q, ranked, topK); // Reranks the same ranked hits used for the initial event.
18146
+ const rerankMs = hrTimeMs(rerankStart);
18147
+ const rerankedResults = this.buildResults(reranked, topK, groupByPage);
18148
+ yield { // Second event: results rebuilt from the reranked order; embed/vector timings are repeated, total is re-measured.
18149
+ phase: "reranked",
18150
+ data: {
18151
+ q: input.q,
18152
+ scope: resolvedScope.scopeName,
18153
+ results: rerankedResults,
18154
+ meta: {
18155
+ timingsMs: {
18156
+ embed: Math.round(embedMs),
18157
+ vector: Math.round(vectorMs),
18158
+ rerank: Math.round(rerankMs),
18159
+ total: Math.round(hrTimeMs(totalStart))
18160
+ },
18161
+ usedRerank: true,
18162
+ modelId: this.config.embeddings.model
18163
+ }
18164
+ }
18165
+ };
18166
+ }
18167
+ buildResults(ordered, topK, groupByPage) {
18045
18168
  const minScore = this.config.ranking.minScore;
18046
18169
  if (groupByPage) {
18047
18170
  let pages = aggregateByPage(ordered, this.config);
@@ -18049,10 +18172,10 @@ var SearchEngine = class _SearchEngine {
18049
18172
  pages = pages.filter((p) => p.pageScore >= minScore);
18050
18173
  }
18051
18174
  const minRatio = this.config.ranking.minChunkScoreRatio;
18052
- results = pages.slice(0, topK).map((page) => {
18175
+ return pages.slice(0, topK).map((page) => {
18053
18176
  const bestScore = page.bestChunk.finalScore;
18054
- const minScore2 = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
18055
- const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minScore2).slice(0, 5);
18177
+ const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
18178
+ const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
18056
18179
  return {
18057
18180
  url: page.url,
18058
18181
  title: page.title,
@@ -18069,10 +18192,11 @@ var SearchEngine = class _SearchEngine {
18069
18192
  };
18070
18193
  });
18071
18194
  } else {
18195
+ let filtered = ordered;
18072
18196
  if (minScore > 0) {
18073
- ordered = ordered.filter((entry) => entry.finalScore >= minScore);
18197
+ filtered = ordered.filter((entry) => entry.finalScore >= minScore);
18074
18198
  }
18075
- results = ordered.slice(0, topK).map(({ hit, finalScore }) => ({
18199
+ return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
18076
18200
  url: hit.metadata.url,
18077
18201
  title: hit.metadata.title,
18078
18202
  sectionTitle: hit.metadata.sectionTitle || void 0,
@@ -18081,21 +18205,6 @@ var SearchEngine = class _SearchEngine {
18081
18205
  routeFile: hit.metadata.routeFile
18082
18206
  }));
18083
18207
  }
18084
- return {
18085
- q: input.q,
18086
- scope: resolvedScope.scopeName,
18087
- results,
18088
- meta: {
18089
- timingsMs: {
18090
- embed: Math.round(embedMs),
18091
- vector: Math.round(vectorMs),
18092
- rerank: Math.round(rerankMs),
18093
- total: Math.round(hrTimeMs(totalStart))
18094
- },
18095
- usedRerank,
18096
- modelId: this.config.embeddings.model
18097
- }
18098
- };
18099
18208
  }
18100
18209
  async getPage(pathOrUrl, scope) {
18101
18210
  const resolvedScope = resolveScope(this.config, scope);
@@ -18370,7 +18479,44 @@ function searchsocketHandle(options = {}) {
18370
18479
  throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
18371
18480
  }
18372
18481
  const engine = await getEngine();
18373
- const result = await engine.search(body);
18482
+ const searchRequest = body;
18483
+ if (searchRequest.stream && searchRequest.rerank) {
18484
+ const encoder = new TextEncoder();
18485
+ const stream = new ReadableStream({
18486
+ async start(controller) {
18487
+ try {
18488
+ for await (const event2 of engine.searchStreaming(searchRequest)) {
18489
+ const line = JSON.stringify(event2) + "\n";
18490
+ controller.enqueue(encoder.encode(line));
18491
+ }
18492
+ } catch (streamError) {
18493
+ const errorEvent = {
18494
+ phase: "error",
18495
+ data: {
18496
+ error: {
18497
+ code: streamError instanceof SearchSocketError ? streamError.code : "INTERNAL_ERROR",
18498
+ message: streamError instanceof Error ? streamError.message : "Unknown error"
18499
+ }
18500
+ }
18501
+ };
18502
+ controller.enqueue(encoder.encode(JSON.stringify(errorEvent) + "\n"));
18503
+ } finally {
18504
+ controller.close();
18505
+ }
18506
+ }
18507
+ });
18508
+ return withCors(
18509
+ new Response(stream, {
18510
+ status: 200,
18511
+ headers: {
18512
+ "content-type": "application/x-ndjson"
18513
+ }
18514
+ }),
18515
+ event.request,
18516
+ config
18517
+ );
18518
+ }
18519
+ const result = await engine.search(searchRequest);
18374
18520
  return withCors(
18375
18521
  new Response(JSON.stringify(result), {
18376
18522
  status: 200,
@@ -19599,6 +19745,17 @@ function extractFromHtml(url, html, config) {
19599
19745
  if ($(`[${config.extract.noindexAttr}]`).length > 0) {
19600
19746
  return null;
19601
19747
  }
19748
+ const weightRaw = $("meta[name='searchsocket-weight']").attr("content")?.trim();
19749
+ let weight;
19750
+ if (weightRaw !== void 0) {
19751
+ const parsed = Number(weightRaw);
19752
+ if (Number.isFinite(parsed) && parsed >= 0) {
19753
+ weight = parsed;
19754
+ }
19755
+ }
19756
+ if (weight === 0) {
19757
+ return null;
19758
+ }
19602
19759
  const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
19603
19760
  const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
19604
19761
  const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
@@ -19654,7 +19811,8 @@ function extractFromHtml(url, html, config) {
19654
19811
  noindex: false,
19655
19812
  tags,
19656
19813
  description,
19657
- keywords
19814
+ keywords,
19815
+ weight
19658
19816
  };
19659
19817
  }
19660
19818
  function extractFromMarkdown(url, markdown, title) {
@@ -19667,6 +19825,14 @@ function extractFromMarkdown(url, markdown, title) {
19667
19825
  if (frontmatter.noindex === true || searchsocketMeta?.noindex === true) {
19668
19826
  return null;
19669
19827
  }
19828
+ let mdWeight;
19829
+ const rawWeight = searchsocketMeta?.weight ?? frontmatter.searchsocketWeight;
19830
+ if (typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0) {
19831
+ mdWeight = rawWeight;
19832
+ }
19833
+ if (mdWeight === 0) {
19834
+ return null;
19835
+ }
19670
19836
  const content = parsed.content;
19671
19837
  const normalized = normalizeMarkdown(content);
19672
19838
  if (!normalizeText(normalized)) {
@@ -19689,7 +19855,8 @@ function extractFromMarkdown(url, markdown, title) {
19689
19855
  noindex: false,
19690
19856
  tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
19691
19857
  description: fmDescription,
19692
- keywords: fmKeywords
19858
+ keywords: fmKeywords,
19859
+ weight: mdWeight
19693
19860
  };
19694
19861
  }
19695
19862
  function yamlString(value) {
@@ -19958,15 +20125,7 @@ function expandDynamicUrl(url, value) {
19958
20125
  return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, value);
19959
20126
  }
19960
20127
  function isExcluded(url, patterns) {
19961
- for (const pattern of patterns) {
19962
- if (pattern.endsWith("/*")) {
19963
- const prefix = pattern.slice(0, -1);
19964
- if (url.startsWith(prefix) || url === prefix.slice(0, -1)) return true;
19965
- } else if (url === pattern) {
19966
- return true;
19967
- }
19968
- }
19969
- return false;
20128
+ return matchUrlPatterns(url, patterns);
19970
20129
  }
19971
20130
  function findFreePort() {
19972
20131
  return new Promise((resolve, reject) => {
@@ -20382,12 +20541,83 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
20382
20541
  }
20383
20542
  return pages;
20384
20543
  }
20544
/**
 * Parse a robots.txt document and return the rules that apply to `userAgent`.
 *
 * Lines are processed in order. Consecutive `User-agent` lines open a single
 * group that shares the `Allow` / `Disallow` lines following them; the first
 * `User-agent` line seen after a rule line starts a new group. Directives
 * other than `User-agent`, `Allow` and `Disallow` (for example `Sitemap` or
 * `Crawl-delay`) are ignored and do not end the current group. Rules with an
 * empty value are not recorded. When an agent appears in several groups its
 * rules are merged.
 *
 * @param {string} content - Raw robots.txt text.
 * @param {string} [userAgent="Searchsocket"] - Agent name, compared case-insensitively.
 * @returns {{ disallow: string[], allow: string[] }} The group for `userAgent`
 *   when it holds at least one rule, otherwise the `*` group, otherwise an
 *   empty rule set.
 */
function parseRobotsTxt(content, userAgent = "Searchsocket") {
  const agentGroups = /* @__PURE__ */ new Map();
  let currentAgents = [];
  // Set once the open group has seen an Allow/Disallow line, so that the next
  // User-agent line starts a fresh group instead of extending this one.
  let groupHasRules = false;
  for (const rawLine of content.split(/\r?\n/)) {
    const line = rawLine.replace(/#.*$/, "").trim();
    if (!line) continue;
    const colonIdx = line.indexOf(":");
    if (colonIdx === -1) continue;
    const directive = line.slice(0, colonIdx).trim().toLowerCase();
    const value = line.slice(colonIdx + 1).trim();
    if (directive === "user-agent") {
      if (groupHasRules) {
        currentAgents = [];
        groupHasRules = false;
      }
      const agentName = value.toLowerCase();
      if (!currentAgents.includes(agentName)) {
        currentAgents.push(agentName);
      }
      if (!agentGroups.has(agentName)) {
        agentGroups.set(agentName, { disallow: [], allow: [] });
      }
    } else if (directive === "disallow" || directive === "allow") {
      // Even an empty rule line closes the run of User-agent lines.
      groupHasRules = true;
      if (value) {
        for (const agent of currentAgents) {
          agentGroups.get(agent)[directive].push(value);
        }
      }
    }
    // Any other directive is ignored without ending the current group.
  }
  const specific = agentGroups.get(userAgent.toLowerCase());
  if (specific && (specific.disallow.length > 0 || specific.allow.length > 0)) {
    return specific;
  }
  return agentGroups.get("*") ?? { disallow: [], allow: [] };
}
20579
/**
 * Decide whether a URL path is forbidden by a parsed robots.txt rule set.
 *
 * Each pattern matches from the start of the path. Inside a pattern `*`
 * matches any run of characters and a trailing `$` anchors the match to the
 * end of the path; every other character is literal. The longest matching
 * `disallow` pattern is compared with the longest matching `allow` pattern:
 * the path is blocked only when the disallow pattern is strictly longer, so
 * an allow rule wins ties. Empty patterns never match.
 *
 * @param {string} urlPath - Path to test, e.g. "/docs/intro".
 * @param {{ disallow: string[], allow: string[] }} rules3 - Parsed rule set.
 * @returns {boolean} True when the path must not be indexed.
 */
function isBlockedByRobots(urlPath, rules3) {
  const matches = (pattern) => {
    const anchored = pattern.endsWith("$");
    if (!anchored && !pattern.includes("*")) {
      // Plain prefix rule: no regex needed.
      return urlPath.startsWith(pattern);
    }
    const body = anchored ? pattern.slice(0, -1) : pattern;
    // Escape each literal piece; runs of "*" collapse into a single ".*".
    const source = body
      .split(/\*+/)
      .map((literal) => literal.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
      .join(".*");
    return new RegExp(`^${source}${anchored ? "$" : ""}`, "s").test(urlPath);
  };
  const longestMatch = (patterns) => {
    let longest = 0;
    for (const pattern of patterns) {
      if (pattern.length > longest && matches(pattern)) {
        longest = pattern.length;
      }
    }
    return longest;
  };
  const disallowLength = longestMatch(rules3.disallow);
  if (disallowLength === 0) return false;
  return longestMatch(rules3.allow) < disallowLength;
}
20595
+ async function loadRobotsTxtFromDir(dir) { // Reads "<dir>/robots.txt" as UTF-8 and returns the result of parseRobotsTxt on it, or null on failure.
20596
+ try {
20597
+ const content = await fs4__default.default.readFile(path__default.default.join(dir, "robots.txt"), "utf8");
20598
+ return parseRobotsTxt(content);
20599
+ } catch { // Any error raised inside the try block is treated as "no robots rules".
20600
+ return null;
20601
+ }
20602
+ }
20603
+ async function fetchRobotsTxt(baseUrl) { // Fetches "/robots.txt" resolved against baseUrl and returns the result of parseRobotsTxt on it, or null on failure.
20604
+ try {
20605
+ const url = new URL("/robots.txt", baseUrl).href; // Absolute path: any path component of baseUrl is replaced.
20606
+ const response = await fetch(url);
20607
+ if (!response.ok) return null; // Non-2xx status: treated as "no robots rules".
20608
+ const content = await response.text();
20609
+ return parseRobotsTxt(content);
20610
+ } catch { // Invalid base URL, network failure or any other error: treated as "no robots rules".
20611
+ return null;
20612
+ }
20613
+ }
20385
20614
 
20386
20615
  // src/indexing/pipeline.ts
20387
20616
  var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
20388
- "jina-embeddings-v3": 2e-5
20617
+ "jina-embeddings-v3": 2e-5,
20618
+ "jina-embeddings-v5-text-small": 5e-5
20389
20619
  };
20390
- var DEFAULT_EMBEDDING_PRICE_PER_1K = 2e-5;
20620
+ var DEFAULT_EMBEDDING_PRICE_PER_1K = 5e-5;
20391
20621
  var IndexPipeline = class _IndexPipeline {
20392
20622
  cwd;
20393
20623
  config;
@@ -20465,6 +20695,53 @@ var IndexPipeline = class _IndexPipeline {
20465
20695
  }
20466
20696
  stageEnd("source", sourceStart);
20467
20697
  this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
20698
+ const filterStart = stageStart();
20699
+ let filteredSourcePages = sourcePages;
20700
+ if (this.config.exclude.length > 0) {
20701
+ const beforeExclude = filteredSourcePages.length;
20702
+ filteredSourcePages = filteredSourcePages.filter((p) => {
20703
+ const url = normalizeUrlPath(p.url);
20704
+ if (matchUrlPatterns(url, this.config.exclude)) {
20705
+ this.logger.debug(`Excluding ${url} (matched exclude pattern)`);
20706
+ return false;
20707
+ }
20708
+ return true;
20709
+ });
20710
+ const excludedCount = beforeExclude - filteredSourcePages.length;
20711
+ if (excludedCount > 0) {
20712
+ this.logger.info(`Excluded ${excludedCount} page${excludedCount === 1 ? "" : "s"} by config exclude patterns`);
20713
+ }
20714
+ }
20715
+ if (this.config.respectRobotsTxt) {
20716
+ let robotsRules = null;
20717
+ if (sourceMode === "static-output") {
20718
+ robotsRules = await loadRobotsTxtFromDir(
20719
+ path__default.default.resolve(this.cwd, this.config.source.staticOutputDir)
20720
+ );
20721
+ } else if (sourceMode === "build" && this.config.source.build) {
20722
+ robotsRules = await loadRobotsTxtFromDir(
20723
+ path__default.default.resolve(this.cwd, this.config.source.build.outputDir)
20724
+ );
20725
+ } else if (sourceMode === "crawl" && this.config.source.crawl) {
20726
+ robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
20727
+ }
20728
+ if (robotsRules) {
20729
+ const beforeRobots = filteredSourcePages.length;
20730
+ filteredSourcePages = filteredSourcePages.filter((p) => {
20731
+ const url = normalizeUrlPath(p.url);
20732
+ if (isBlockedByRobots(url, robotsRules)) {
20733
+ this.logger.debug(`Excluding ${url} (blocked by robots.txt)`);
20734
+ return false;
20735
+ }
20736
+ return true;
20737
+ });
20738
+ const robotsExcluded = beforeRobots - filteredSourcePages.length;
20739
+ if (robotsExcluded > 0) {
20740
+ this.logger.info(`Excluded ${robotsExcluded} page${robotsExcluded === 1 ? "" : "s"} by robots.txt`);
20741
+ }
20742
+ }
20743
+ }
20744
+ stageEnd("filter", filterStart);
20468
20745
  const routeStart = stageStart();
20469
20746
  const routePatterns = await buildRoutePatterns(this.cwd);
20470
20747
  stageEnd("route_map", routeStart);
@@ -20472,7 +20749,7 @@ var IndexPipeline = class _IndexPipeline {
20472
20749
  const extractStart = stageStart();
20473
20750
  this.logger.info("Extracting content...");
20474
20751
  const extractedPages = [];
20475
- for (const sourcePage of sourcePages) {
20752
+ for (const sourcePage of filteredSourcePages) {
20476
20753
  const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
20477
20754
  if (!extracted) {
20478
20755
  this.logger.warn(
@@ -20498,16 +20775,29 @@ var IndexPipeline = class _IndexPipeline {
20498
20775
  seenUrls.add(page.url);
20499
20776
  uniquePages.push(page);
20500
20777
  }
20778
+ const indexablePages = [];
20779
+ for (const page of uniquePages) {
20780
+ const effectiveWeight = page.weight ?? findPageWeight(page.url, this.config.ranking.pageWeights);
20781
+ if (effectiveWeight === 0) {
20782
+ this.logger.debug(`Excluding ${page.url} (zero weight)`);
20783
+ continue;
20784
+ }
20785
+ indexablePages.push(page);
20786
+ }
20787
+ const zeroWeightCount = uniquePages.length - indexablePages.length;
20788
+ if (zeroWeightCount > 0) {
20789
+ this.logger.info(`Excluded ${zeroWeightCount} page${zeroWeightCount === 1 ? "" : "s"} with zero weight`);
20790
+ }
20501
20791
  stageEnd("extract", extractStart);
20502
- const skippedPages = sourcePages.length - uniquePages.length;
20503
- this.logger.info(`Extracted ${uniquePages.length} page${uniquePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
20792
+ const skippedPages = filteredSourcePages.length - indexablePages.length;
20793
+ this.logger.info(`Extracted ${indexablePages.length} page${indexablePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
20504
20794
  const linkStart = stageStart();
20505
- const pageSet = new Set(uniquePages.map((page) => normalizeUrlPath(page.url)));
20795
+ const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
20506
20796
  const incomingLinkCount = /* @__PURE__ */ new Map();
20507
- for (const page of uniquePages) {
20797
+ for (const page of indexablePages) {
20508
20798
  incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
20509
20799
  }
20510
- for (const page of uniquePages) {
20800
+ for (const page of indexablePages) {
20511
20801
  for (const outgoing of page.outgoingLinks) {
20512
20802
  if (!pageSet.has(outgoing)) {
20513
20803
  continue;
@@ -20531,7 +20821,7 @@ var IndexPipeline = class _IndexPipeline {
20531
20821
  });
20532
20822
  }
20533
20823
  }
20534
- for (const page of uniquePages) {
20824
+ for (const page of indexablePages) {
20535
20825
  const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
20536
20826
  if (routeMatch.routeResolution === "best-effort") {
20537
20827
  if (this.config.source.strictRouteMapping) {
@@ -20817,7 +21107,7 @@ function searchsocketVitePlugin(options = {}) {
20817
21107
  });
20818
21108
  const stats = await pipeline.run({
20819
21109
  changedOnly: options.changedOnly ?? true,
20820
- force: options.force ?? false,
21110
+ force: (options.force ?? false) || /^(1|true|yes)$/i.test(process.env.SEARCHSOCKET_FORCE_REINDEX ?? ""),
20821
21111
  dryRun: options.dryRun ?? false,
20822
21112
  scopeOverride: options.scope,
20823
21113
  verbose: options.verbose
@@ -1,4 +1,4 @@
1
- import { R as ResolvedSearchSocketConfig, b as SearchSocketConfig } from './types-BrG6XTUU.cjs';
1
+ import { R as ResolvedSearchSocketConfig, d as SearchSocketConfig } from './types-z2dw3H6E.cjs';
2
2
 
3
3
  interface SearchSocketHandleOptions {
4
4
  configPath?: string;
@@ -1,4 +1,4 @@
1
- import { R as ResolvedSearchSocketConfig, b as SearchSocketConfig } from './types-BrG6XTUU.js';
1
+ import { R as ResolvedSearchSocketConfig, d as SearchSocketConfig } from './types-z2dw3H6E.js';
2
2
 
3
3
  interface SearchSocketHandleOptions {
4
4
  configPath?: string;