@ainyc/canonry 4.85.0 → 4.86.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/assets/agent-workspace/skills/canonry/references/canonry-cli.md +3 -1
- package/assets/assets/{BacklinksPage-CDAv0ggn.js → BacklinksPage-BNrvc-gV.js} +1 -1
- package/assets/assets/{ChartPrimitives-CnAmsyt7.js → ChartPrimitives-BlIkdUdy.js} +1 -1
- package/assets/assets/{ProjectPage-C9KEgRxD.js → ProjectPage-CAyx_xNr.js} +2 -2
- package/assets/assets/{RunRow-CVZ5o8fg.js → RunRow-CAPnKzi7.js} +1 -1
- package/assets/assets/{RunsPage-Bzy5c0MZ.js → RunsPage-idnuzKBn.js} +1 -1
- package/assets/assets/{SettingsPage-B1ocxPBe.js → SettingsPage-Bka67uJq.js} +1 -1
- package/assets/assets/{TrafficPage-D2zepQOC.js → TrafficPage-C_o-rA5o.js} +1 -1
- package/assets/assets/{TrafficSourceDetailPage-C7JuAkaK.js → TrafficSourceDetailPage-D_jvoSTV.js} +1 -1
- package/assets/assets/{arrow-left-Bv3CWylm.js → arrow-left-B-JfzARi.js} +1 -1
- package/assets/assets/{extract-error-message-BtVid5TP.js → extract-error-message-BhPbjIX6.js} +1 -1
- package/assets/assets/{index-DmNti_xn.js → index-uPSrDA8e.js} +61 -61
- package/assets/assets/{trash-2-BoimCsYz.js → trash-2-BbRvn40h.js} +1 -1
- package/assets/index.html +1 -1
- package/dist/{chunk-I2BJC3DT.js → chunk-23HGQV22.js} +439 -207
- package/dist/{chunk-3K3QRSYE.js → chunk-DLBQU3VG.js} +93 -2
- package/dist/{chunk-62YB3ML7.js → chunk-LLJPZKHG.js} +46 -1
- package/dist/{chunk-7BMSWI2K.js → chunk-SELXBOAP.js} +19 -4
- package/dist/cli.js +63 -4
- package/dist/index.js +4 -4
- package/dist/{intelligence-service-AHHBQKRD.js → intelligence-service-ZHUJKZRO.js} +2 -2
- package/dist/mcp.js +2 -2
- package/package.json +7 -7
|
@@ -1832,6 +1832,210 @@ import { z as z18 } from "zod";
|
|
|
1832
1832
|
|
|
1833
1833
|
// ../contracts/src/discovery.ts
|
|
1834
1834
|
import { z as z17 } from "zod";
|
|
1835
|
+
|
|
1836
|
+
// ../contracts/src/embeddings.ts
|
|
1837
|
+
/**
 * Cosine similarity of two numeric vectors of equal length.
 *
 * @param {number[]} a - First vector.
 * @param {number[]} b - Second vector.
 * @returns {number} dot(a, b) / (|a| * |b|); 0 when either vector has zero magnitude.
 * @throws {Error} When either vector is empty or the two lengths differ.
 */
function cosineSimilarity(a, b) {
  const sizeA = a.length;
  const sizeB = b.length;
  if (sizeA === 0 || sizeB === 0) {
    throw new Error("cosineSimilarity: vectors must be non-empty");
  }
  if (sizeA !== sizeB) {
    throw new Error(`cosineSimilarity: vector length mismatch (${sizeA} vs ${sizeB})`);
  }
  let dotProduct = 0;
  let squaredNormA = 0;
  let squaredNormB = 0;
  let k = 0;
  while (k < sizeA) {
    const x = a[k];
    const y = b[k];
    dotProduct += x * y;
    squaredNormA += x * x;
    squaredNormB += y * y;
    k += 1;
  }
  // A zero vector has no direction; report "no similarity" instead of dividing by zero.
  if (squaredNormA === 0 || squaredNormB === 0) return 0;
  return dotProduct / (Math.sqrt(squaredNormA) * Math.sqrt(squaredNormB));
}
|
|
1855
|
+
/**
 * Groups items into single-linkage clusters: two items share a cluster when
 * they are connected by a chain of pairs whose cosine similarity is at least
 * `threshold`. Connectivity is tracked with a union-find over item indices.
 *
 * @param {Array} items - Items to cluster.
 * @param {number[][]} vectors - One vector per item, in the same order as `items`.
 * @param {number} threshold - Minimum similarity for linking two items, within [0, 1].
 * @returns {Array<Array>} Clusters ordered by their lowest member index; members
 *   keep their original relative order.
 * @throws {Error} When `threshold` is outside [0, 1] or the two lengths differ.
 */
function clusterByCosine(items, vectors, threshold) {
  if (threshold < 0 || threshold > 1) {
    throw new Error(`clusterByCosine: threshold must be in [0, 1], got ${threshold}`);
  }
  if (items.length !== vectors.length) {
    throw new Error(`clusterByCosine: items/vectors length mismatch (${items.length} vs ${vectors.length})`);
  }
  const count = items.length;
  if (count === 0) return [];

  // parent[i] === i marks i as the root of its set.
  const parent = items.map((_unused, index) => index);

  const rootOf = (node) => {
    let top = node;
    while (parent[top] !== top) top = parent[top];
    // Path compression: point every node on the walked chain straight at the root.
    for (let walker = node; parent[walker] !== top; ) {
      const above = parent[walker];
      parent[walker] = top;
      walker = above;
    }
    return top;
  };

  // Compare every unordered pair once and merge the sets of similar items.
  for (let left = 0; left < count; left++) {
    for (let right = left + 1; right < count; right++) {
      if (cosineSimilarity(vectors[left], vectors[right]) < threshold) continue;
      const leftRoot = rootOf(left);
      const rightRoot = rootOf(right);
      if (leftRoot !== rightRoot) parent[leftRoot] = rightRoot;
    }
  }

  // Bucket indices by root; Map insertion order keeps clusters in first-seen order.
  const groups = /* @__PURE__ */ new Map();
  for (let index = 0; index < count; index++) {
    const key = rootOf(index);
    if (!groups.has(key)) groups.set(key, []);
    groups.get(key).push(index);
  }
  return [...groups.values()].map((members) => members.map((member) => items[member]));
}
|
|
1896
|
+
/**
 * Returns the cluster member with the smallest `length`; on ties the
 * earliest member wins.
 *
 * @param {Array<{length: number}>} cluster - Non-empty list of members (e.g. strings).
 * @returns {*} The shortest member.
 * @throws {Error} When the cluster is empty.
 */
function pickClusterRepresentative(cluster) {
  if (cluster.length === 0) throw new Error("pickClusterRepresentative: cluster is empty");
  const [first, ...others] = cluster;
  let shortest = first;
  for (const candidate of others) {
    // Strict "<" so an equally short later member never displaces an earlier one.
    if (candidate.length < shortest.length) shortest = candidate;
  }
  return shortest;
}
|
|
1904
|
+
|
|
1905
|
+
// ../contracts/src/url-normalize.ts
|
|
1906
|
+
// Query-string keys removed during URL normalization, listed by origin.
var STRIP_KEYS = /* @__PURE__ */ new Set([
  // Click identifiers
  ...["fbclid", "gclid", "msclkid", "ttclid", "li_fat_id", "igshid"],
  ...["yclid", "dclid", "gbraid", "wbraid", "bingid"],
  // Mailchimp
  ...["mc_cid", "mc_eid"],
  // Google Analytics linkers
  ...["_ga", "_gl"],
  // Google Tag Manager debug
  ...["gtm_latency", "gtm_debug"],
  // WordPress internal noise
  ...["preview", "preview_id", "preview_nonce", "_thumbnail_id"],
  // Common cache-busters/versioning
  ...["v", "ver", "version"]
]);
|
|
1938
|
+
/**
 * Decides whether a query-string key is dropped during normalization.
 *
 * @param {string} key - Query parameter name, compared case-sensitively.
 * @returns {boolean} True when the key is listed in `STRIP_KEYS` or starts with `utm_`.
 */
function shouldStrip(key) {
  return STRIP_KEYS.has(key) || key.startsWith("utm_");
}
|
|
1943
|
+
/**
 * Splits a raw query string (without the leading "?") into key/value pairs.
 * Values are kept exactly as written; no percent-decoding is performed.
 *
 * Empty segments produced by stray separators ("a=1&&b=2", "a=1&") are
 * skipped. Otherwise they would survive as empty-key pairs and re-encode as
 * a dangling "&" (e.g. "?&a=1"), so equivalent URLs would normalize to
 * different strings.
 *
 * @param {string} query - Query string without the leading "?".
 * @returns {Array<{key: string, value: (string|null)}>} Pairs in source order;
 *   `value` is null for a bare key that has no "=".
 */
function parseQuery(query) {
  if (query === "") return [];
  const pairs = [];
  for (const segment of query.split("&")) {
    if (segment === "") continue;
    const eq = segment.indexOf("=");
    if (eq === -1) {
      pairs.push({ key: segment, value: null });
    } else {
      // Split on the first "=" only; later "=" characters belong to the value.
      pairs.push({ key: segment.slice(0, eq), value: segment.slice(eq + 1) });
    }
  }
  return pairs;
}
|
|
1951
|
+
/**
 * Serializes key/value pairs back into a query string (without the leading "?").
 *
 * @param {Array<{key: string, value: (string|null)}>} pairs - Pairs to serialize.
 * @returns {string} Pairs joined with "&"; a pair whose value is null is
 *   written as the bare key, every other pair as `key=value`.
 */
function encodeQuery(pairs) {
  const parts = [];
  for (const pair of pairs) {
    if (pair.value === null) {
      parts.push(pair.key);
    } else {
      parts.push(`${pair.key}=${pair.value}`);
    }
  }
  return parts.join("&");
}
|
|
1954
|
+
/**
 * Maps the root index documents onto the root path.
 *
 * @param {string} path - URL path.
 * @returns {string} "/" when the path is exactly "/index.html" or "/index.php";
 *   otherwise the path unchanged.
 */
function collapseRootIndex(path) {
  switch (path) {
    case "/index.html":
    case "/index.php":
      return "/";
    default:
      return path;
  }
}
|
|
1958
|
+
/**
 * Removes every trailing "/" from a path while keeping the root path intact.
 *
 * A path made only of slashes ("//", "///") collapses to "/" rather than
 * to the empty string, so the result is always a usable path.
 *
 * @param {string} path - URL path.
 * @returns {string} The path without trailing slashes; "/" (and "") are
 *   returned unchanged.
 */
function dropTrailingSlash(path) {
  if (path.length <= 1 || !path.endsWith("/")) return path;
  // Walk back over the trailing run of slashes.
  let end = path.length;
  while (end > 0 && path[end - 1] === "/") end--;
  // end === 0 means the whole path was slashes: that is the root.
  return end === 0 ? "/" : path.slice(0, end);
}
|
|
1964
|
+
/**
 * Turns a possibly relative project URL into an absolute https URL.
 *
 * @param {string} url - Absolute, protocol-relative ("//host/x"), or relative URL.
 * @param {string} canonicalDomain - Project domain; an http(s) scheme prefix and
 *   trailing slashes are stripped before use.
 * @returns {string} "" for empty or blank input; the trimmed input when it
 *   already starts with http(s):// or when no usable domain is available;
 *   otherwise an `https://` URL.
 */
function absolutizeProjectUrl(url, canonicalDomain) {
  const candidate = url ? url.trim() : "";
  if (candidate === "") return "";

  const hasHttpScheme = /^https?:\/\//i.test(candidate);
  if (hasHttpScheme) return candidate;
  // Protocol-relative URL: only the scheme is missing.
  if (candidate.startsWith("//")) return `https:${candidate}`;

  const bareHost = canonicalDomain
    .trim()
    .replace(/^https?:\/\//i, "")
    .replace(/\/+$/, "");
  if (bareHost === "") return candidate;

  // Put exactly one "/" between the host and a path that lacks a leading slash.
  const separator = candidate.startsWith("/") ? "" : "/";
  return `https://${bareHost}${separator}${candidate}`;
}
|
|
1975
|
+
/**
 * Extracts the lowercase hostname from a URL or bare domain, without a
 * leading "www.".
 *
 * The scheme is detected only at the start of the value. A bare host whose
 * path or query embeds another URL ("example.com/r?u=https://other.com")
 * is still prefixed with "https://" and parsed, instead of being mistaken
 * for an absolute URL and rejected.
 *
 * @param {(string|null|undefined)} value - URL ("https://www.a.com/x") or bare host ("a.com").
 * @returns {(string|null)} The hostname, or null for null, undefined, blank, or
 *   unparseable input.
 */
function hostOf(value) {
  if (value == null) return null;
  const trimmed = value.trim();
  if (!trimmed) return null;
  // RFC 3986 scheme syntax followed by "://", anchored at the start of the value.
  const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed);
  try {
    const url = new URL(hasScheme ? trimmed : `https://${trimmed}`);
    return url.hostname.replace(/^www\./, "").toLowerCase();
  } catch {
    // Unparseable input is an expected outcome here, not an error.
    return null;
  }
}
|
|
1986
|
+
/**
 * Normalizes a URL or path into a canonical "path[?query]" string.
 *
 * Steps, in order:
 *  1. Trim and collapse whitespace runs to single spaces.
 *  2. Strip trailing ")" / "." characters that follow a letter or digit; treat
 *     inputs starting with "/)" or "/ " as the root; keep only the text
 *     before the first space.
 *  3. Split into path and query. http(s) URLs go through `URL`; anything
 *     else is split by hand after dropping the "#fragment".
 *  4. Collapse the root index document, drop trailing slashes, remove the
 *     query keys rejected by `shouldStrip`, and sort the rest by key.
 *
 * @param {(string|null|undefined)} input - Absolute URL or path.
 * @returns {(string|null)} Normalized path, or null for null, undefined, blank,
 *   "(not set)", or an http(s) URL that fails to parse.
 */
function normalizeUrlPath(input) {
  if (input == null) return null;
  let text = input.trim();
  if (text === "") return null;

  // Whitespace cleanup: non-breaking spaces first, then every whitespace run.
  text = text.replace(/\u00a0/g, " ").replace(/\s+/g, " ").trim();
  if (text === "" || text === "/") return "/";
  if (text === "(not set)") return null;

  // Trailing ")" / "." after an alphanumeric character are removed.
  text = text.replace(/([a-z0-9])[).]+$/i, "$1");
  if (text.startsWith("/)") || text.startsWith("/ ")) {
    text = "/";
  }
  if (text.includes(" ")) {
    text = text.split(" ")[0];
  }
  if (text === "" || text === "/") return "/";

  let pathPart;
  let queryPart;
  if (/^https?:\/\//i.test(text)) {
    let parsed;
    try {
      parsed = new URL(text);
    } catch {
      return null;
    }
    pathPart = parsed.pathname || "/";
    queryPart = parsed.search.startsWith("?") ? parsed.search.slice(1) : parsed.search;
  } else {
    // Relative input: cut the fragment, then split at the first "?".
    const hashAt = text.indexOf("#");
    const withoutFragment = hashAt === -1 ? text : text.slice(0, hashAt);
    const queryAt = withoutFragment.indexOf("?");
    pathPart = queryAt === -1 ? withoutFragment : withoutFragment.slice(0, queryAt);
    queryPart = queryAt === -1 ? "" : withoutFragment.slice(queryAt + 1);
  }

  if (pathPart === "") pathPart = "/";
  pathPart = dropTrailingSlash(collapseRootIndex(pathPart));

  const kept = parseQuery(queryPart).filter(({ key }) => !shouldStrip(key));
  // Sort by key using code-unit ordering; pairs with equal keys keep their order.
  kept.sort((left, right) => {
    if (left.key < right.key) return -1;
    return left.key > right.key ? 1 : 0;
  });
  return kept.length === 0 ? pathPart : `${pathPart}?${encodeQuery(kept)}`;
}
|
|
2037
|
+
|
|
2038
|
+
// ../contracts/src/discovery.ts
|
|
1835
2039
|
var discoveryBucketSchema = z17.enum(["cited", "aspirational", "wasted-surface"]);
|
|
1836
2040
|
var DiscoveryBuckets = discoveryBucketSchema.enum;
|
|
1837
2041
|
var DEFAULT_DISCOVERY_PROMOTE_BUCKETS = [
|
|
@@ -1963,6 +2167,231 @@ var queryProvenanceSchema = z17.union([
|
|
|
1963
2167
|
z17.literal("cli"),
|
|
1964
2168
|
z17.string().regex(/^discovery:.+$/)
|
|
1965
2169
|
]);
|
|
2170
|
+
// Tuning knobs for the discovery harvest gates.

// Candidates with more words than this are rejected with reason `length`.
var DISCOVERY_HARVEST_MAX_WORDS = 12;
// Candidates whose normalized text is shorter than this are rejected with reason `length`.
var DISCOVERY_HARVEST_MIN_CHARS = 3;
// Default cosine threshold for the semantic novelty pass.
var DISCOVERY_HARVEST_NOVELTY_THRESHOLD = DISCOVERY_DEFAULT_DEDUP_THRESHOLD;
// The anchor filter only runs when at least this many anchor terms are available.
var DISCOVERY_HARVEST_MIN_ANCHOR_TERMS = 1;
// A run of this many consecutive digits marks a query as navigational.
var DISCOVERY_HARVEST_PHONE_DIGITS = 7;
// Minimum token length for a token to count as significant.
var HARVEST_SIGNIFICANT_TOKEN_MIN = 4;
|
|
2176
|
+
// Tokens never treated as significant, whatever their length.
var HARVEST_STOPWORDS = /* @__PURE__ */ new Set(
  [
    "the and for with near best top",
    "your you are how what does this",
    "that from into about who why"
  ].flatMap((row) => row.split(" "))
);
|
|
2198
|
+
// Words and phrases that mark a query as navigational. Entries containing a
// space are matched as phrases; the others are matched as whole tokens.
var HARVEST_NAV_MARKERS = [
  "address", "directions", "hours",
  "login", "log in", "sign in", "signin",
  "phone number", "zip code", "postal code", "email address"
];
|
|
2211
|
+
// One harvested query and its probe-hit count.
var discoveryHarvestCandidateSchema = z17.object({
  /** Query text; must be non-empty. */
  query: z17.string().min(1),
  /** Positive integer hit count. */
  probeHits: z17.number().int().positive()
});
|
|
2215
|
+
// Counters describing what the harvest gates did with the candidates.
var discoveryHarvestStatsSchema = (() => {
  // Every counter is a non-negative integer; each field gets its own schema instance.
  const tally = () => z17.number().int().nonnegative();
  return z17.object({
    /** Distinct candidates extracted before gating. */
    rawCandidates: tally(),
    /** Candidates that passed every gate. */
    admitted: tally(),
    /** Per-reason rejection tally (each rejected candidate counted exactly once,
     * at the first gate it failed). Lexical-gate order: belowFloor → length →
     * navigational → duplicate (EXACT already-tracked) → offAnchor; then
     * `semanticDuplicate` for candidates dropped by the cosine novelty pass.
     * Invariant: `admitted + Σ(rejected) === rawCandidates`. */
    rejected: z17.object({
      belowFloor: tally(),
      length: tally(),
      navigational: tally(),
      /** Dropped by the cheap exact-match check against the tracked basket. */
      duplicate: tally(),
      offAnchor: tally(),
      /** Dropped by the embedding cosine novelty pass (a paraphrase / synonym /
       * stem variant of a tracked query that exact match can't see). 0 when the
       * semantic pass did not run — see `semanticNoveltyApplied`. */
      semanticDuplicate: tally()
    })
  });
})();
|
|
2238
|
+
// Response shape for a harvest of one discovery session.
var discoveryHarvestDtoSchema = z17.object({
  sessionId: z17.string(),
  projectId: z17.string(),
  /**
   * Provider whose probes were harvested (the session's seed provider).
   * Discovery is Gemini-only today; the field is carried so a future
   * multi-provider discovery can attribute candidates.
   */
  provider: z17.string(),
  status: discoverySessionStatusSchema,
  /**
   * Recurrence floor that was applied: a candidate needed to appear in at
   * least this many distinct probes to be admitted.
   */
  minProbeHits: z17.number().int().positive(),
  /**
   * Whether the subject-anchor filter actually ran (it was requested AND the
   * corpus had ≥ `DISCOVERY_HARVEST_MIN_ANCHOR_TERMS` significant terms).
   */
  anchorApplied: z17.boolean(),
  /**
   * Whether the embedding cosine novelty pass ran. False when embeddings were
   * unavailable (no Gemini key / no tracked queries / no candidates); novelty
   * then fell back to the cheap exact-match check only.
   */
  semanticNoveltyApplied: z17.boolean(),
  candidates: z17.array(discoveryHarvestCandidateSchema),
  stats: discoveryHarvestStatsSchema
});
|
|
2259
|
+
/**
 * Canonical form of a harvested query: trimmed, whitespace runs collapsed to
 * one space, lowercased.
 *
 * @param {string} query - Raw query text.
 * @returns {string} Normalized query; "" when the input is blank.
 */
function normalizeHarvestQuery(query) {
  const words = query.trim().split(/\s+/);
  const singleSpaced = words.join(" ");
  return singleSpaced.toLowerCase();
}
|
|
2262
|
+
/**
 * Splits a query into lowercase ASCII alphanumeric tokens.
 *
 * @param {string} query - Raw or normalized query text.
 * @returns {string[]} Maximal runs of [a-z0-9] in the normalized query, in order.
 */
function harvestTokens(query) {
  const runs = normalizeHarvestQuery(query).match(/[a-z0-9]+/g);
  return runs === null ? [] : runs;
}
|
|
2265
|
+
/**
 * Tokens of a query that carry meaning: at least
 * `HARVEST_SIGNIFICANT_TOKEN_MIN` characters long and not a stopword.
 *
 * @param {string} query - Raw or normalized query text.
 * @returns {string[]} Significant tokens in order; duplicates are kept.
 */
function significantHarvestTokens(query) {
  const significant = [];
  for (const token of harvestTokens(query)) {
    if (token.length < HARVEST_SIGNIFICANT_TOKEN_MIN) continue;
    if (HARVEST_STOPWORDS.has(token)) continue;
    significant.push(token);
  }
  return significant;
}
|
|
2270
|
+
/**
 * Length of the longest run of consecutive ASCII digits in a string.
 *
 * @param {string} query - Text to scan.
 * @returns {number} Longest digit-run length; 0 when the text has no digits.
 */
function longestDigitRun(query) {
  const runs = query.match(/[0-9]+/g);
  if (runs === null) return 0;
  return runs.reduce((longest, run) => Math.max(longest, run.length), 0);
}
|
|
2283
|
+
/**
 * Flags queries that look navigational rather than topical.
 *
 * A query is navigational when it contains a digit run of at least
 * `DISCOVERY_HARVEST_PHONE_DIGITS` characters, or when it matches one of
 * `HARVEST_NAV_MARKERS`. Single-word markers must equal a whole token;
 * multi-word markers must occur as a phrase bounded by non-alphanumerics.
 *
 * @param {string} query - Raw or normalized query text.
 * @returns {boolean} True when the query is navigational.
 */
function isNavigationalHarvestQuery(query) {
  const normalized = normalizeHarvestQuery(query);
  if (longestDigitRun(normalized) >= DISCOVERY_HARVEST_PHONE_DIGITS) return true;

  const tokenSet = new Set(harvestTokens(normalized));
  return HARVEST_NAV_MARKERS.some((marker) => {
    if (!marker.includes(" ")) return tokenSet.has(marker);
    // Phrase marker: allow any whitespace between its words and require
    // non-alphanumeric (or string) boundaries on both sides.
    const phrase = new RegExp(`(?<![a-z0-9])${marker.replace(/ /g, "\\s+")}(?![a-z0-9])`);
    return phrase.test(normalized);
  });
}
|
|
2297
|
+
/**
 * Builds the deduplicated list of anchor terms used by the subject-anchor gate.
 *
 * @param {Iterable<string>} corpus - Texts whose significant tokens become anchor terms.
 * @param {Iterable<string>} [domains=[]] - Domains or URLs; each host, with its
 *   final dot-separated label removed, also contributes significant tokens.
 *   Entries with no resolvable host are ignored.
 * @returns {string[]} Distinct anchor terms in first-seen order.
 */
function buildHarvestAnchorTerms(corpus, domains = []) {
  const terms = /* @__PURE__ */ new Set();
  const collect = (text) => {
    for (const token of significantHarvestTokens(text)) terms.add(token);
  };

  for (const text of corpus) collect(text);
  for (const domain of domains) {
    const host = hostOf(domain);
    // Drop the last label (".com", ".io", …) before tokenizing the host.
    if (host) collect(host.replace(/\.[a-z0-9]+$/, ""));
  }
  return Array.from(terms);
}
|
|
2310
|
+
/**
 * Runs harvested candidates through the lexical gates and tallies rejections.
 *
 * Gate order (a candidate is counted at the first gate it fails):
 * belowFloor → length → navigational → duplicate → offAnchor.
 *
 * @param {object} input
 * @param {Array<{query: string, probeHits: number}>} input.candidates - Candidates to gate.
 * @param {string[]} input.trackedQueries - Already-tracked queries, used for the exact-match check.
 * @param {number} [input.minProbeHits=1] - Recurrence floor; floored and clamped to at least 1.
 * @param {Iterable<string>} [input.anchorTerms] - Anchor terms for the subject-anchor gate.
 * @param {boolean} [input.applyAnchor=true] - Requests the anchor gate; it only runs
 *   when at least `DISCOVERY_HARVEST_MIN_ANCHOR_TERMS` anchor terms exist.
 * @returns {{admitted: Array<{query: string, probeHits: number}>, anchorApplied: boolean, stats: object}}
 *   Admitted candidates (normalized query text) sorted by probeHits descending
 *   and then by query, whether the anchor gate ran, and the tallies.
 */
function gateHarvestedSearchQueries(input) {
  const minProbeHits = Math.max(1, Math.floor(input.minProbeHits ?? 1));
  const anchorTermSet = new Set(input.anchorTerms ?? []);
  const applyAnchor = (input.applyAnchor ?? true) && anchorTermSet.size >= DISCOVERY_HARVEST_MIN_ANCHOR_TERMS;
  const trackedNorm = new Set(
    input.trackedQueries.map((tracked) => normalizeHarvestQuery(tracked)).filter(Boolean)
  );

  const rejected = { belowFloor: 0, length: 0, navigational: 0, duplicate: 0, offAnchor: 0, semanticDuplicate: 0 };
  const stats = { rawCandidates: input.candidates.length, admitted: 0, rejected };

  // Name of the first lexical gate a normalized query fails, or null when it passes all.
  const firstFailedGate = (norm) => {
    const wordCount = norm ? norm.split(" ").length : 0;
    if (norm.length < DISCOVERY_HARVEST_MIN_CHARS || wordCount > DISCOVERY_HARVEST_MAX_WORDS) return "length";
    if (isNavigationalHarvestQuery(norm)) return "navigational";
    if (trackedNorm.has(norm)) return "duplicate";
    if (applyAnchor && !significantHarvestTokens(norm).some((token) => anchorTermSet.has(token))) {
      return "offAnchor";
    }
    return null;
  };

  const admitted = [];
  for (const candidate of input.candidates) {
    // The recurrence floor is checked before the query text is normalized.
    if (candidate.probeHits < minProbeHits) {
      rejected.belowFloor += 1;
      continue;
    }
    const norm = normalizeHarvestQuery(candidate.query);
    const failedGate = firstFailedGate(norm);
    if (failedGate !== null) {
      rejected[failedGate] += 1;
      continue;
    }
    admitted.push({ query: norm, probeHits: candidate.probeHits });
  }
  stats.admitted = admitted.length;

  admitted.sort((left, right) => right.probeHits - left.probeHits || left.query.localeCompare(right.query));
  return { admitted, anchorApplied: applyAnchor, stats };
}
|
|
2355
|
+
/**
 * Drops admitted candidates whose embedding is too close to any tracked query.
 *
 * @param {object} input
 * @param {{admitted: Array, anchorApplied: boolean, stats: object}} input.result -
 *   Result with the same shape as the return value of `gateHarvestedSearchQueries`.
 * @param {number[][]} input.candidateVectors - One vector per admitted candidate, same order.
 * @param {number[][]} input.trackedVectors - Vectors of the tracked queries.
 * @param {number} [input.threshold] - Cosine threshold; defaults to
 *   `DISCOVERY_HARVEST_NOVELTY_THRESHOLD`.
 * @returns {object} `input.result` itself, untouched, when there are no tracked
 *   vectors or the candidate vectors do not line up with the admitted list.
 *   Otherwise a new result with the near-duplicates removed, `stats.admitted`
 *   updated, and `stats.rejected.semanticDuplicate` set to the number removed.
 */
function applyHarvestSemanticNovelty(input) {
  const { result, candidateVectors, trackedVectors } = input;
  const threshold = input.threshold ?? DISCOVERY_HARVEST_NOVELTY_THRESHOLD;
  const previous = result.admitted;
  if (trackedVectors.length === 0 || candidateVectors.length !== previous.length) {
    return result;
  }

  const novel = previous.filter((_candidate, index) => {
    const vector = candidateVectors[index];
    return !trackedVectors.some((tracked) => cosineSimilarity(vector, tracked) >= threshold);
  });

  const rejected = {
    ...result.stats.rejected,
    semanticDuplicate: previous.length - novel.length
  };
  return {
    admitted: novel,
    anchorApplied: result.anchorApplied,
    stats: { ...result.stats, admitted: novel.length, rejected }
  };
}
|
|
2379
|
+
/**
 * Counts, for every distinct normalized query, how many probes contained it.
 * A query repeated inside one probe counts once for that probe. Non-string
 * entries and entries that normalize to "" are ignored.
 *
 * @param {Iterable<{searchQueries: Iterable<*>}>} probes - Probes with their raw queries.
 * @returns {Array<{query: string, probeHits: number}>} One entry per distinct
 *   normalized query, in first-seen order.
 */
function aggregateHarvestedQueries(probes) {
  const tally = /* @__PURE__ */ new Map();
  for (const probe of probes) {
    // Distinct normalized queries of this probe, in first-seen order.
    const distinct = /* @__PURE__ */ new Set();
    for (const raw of probe.searchQueries) {
      if (typeof raw !== "string") continue;
      const norm = normalizeHarvestQuery(raw);
      if (norm) distinct.add(norm);
    }
    for (const norm of distinct) {
      const entry = tally.get(norm);
      if (entry) {
        entry.probeHits += 1;
      } else {
        tally.set(norm, { query: norm, probeHits: 1 });
      }
    }
  }
  return Array.from(tally.values());
}
|
|
1966
2395
|
|
|
1967
2396
|
// ../contracts/src/surface-class.ts
|
|
1968
2397
|
var surfaceClassSchema = z18.enum([
|
|
@@ -3104,139 +3533,6 @@ function summarizeCheckResults(results) {
|
|
|
3104
3533
|
return summary;
|
|
3105
3534
|
}
|
|
3106
3535
|
|
|
3107
|
-
// ../contracts/src/url-normalize.ts
|
|
3108
|
-
var STRIP_KEYS = /* @__PURE__ */ new Set([
|
|
3109
|
-
// Click identifiers
|
|
3110
|
-
"fbclid",
|
|
3111
|
-
"gclid",
|
|
3112
|
-
"msclkid",
|
|
3113
|
-
"ttclid",
|
|
3114
|
-
"li_fat_id",
|
|
3115
|
-
"igshid",
|
|
3116
|
-
"yclid",
|
|
3117
|
-
"dclid",
|
|
3118
|
-
"gbraid",
|
|
3119
|
-
"wbraid",
|
|
3120
|
-
"bingid",
|
|
3121
|
-
// Mailchimp
|
|
3122
|
-
"mc_cid",
|
|
3123
|
-
"mc_eid",
|
|
3124
|
-
// Google Analytics linkers
|
|
3125
|
-
"_ga",
|
|
3126
|
-
"_gl",
|
|
3127
|
-
// Google Tag Manager debug
|
|
3128
|
-
"gtm_latency",
|
|
3129
|
-
"gtm_debug",
|
|
3130
|
-
// WordPress internal noise
|
|
3131
|
-
"preview",
|
|
3132
|
-
"preview_id",
|
|
3133
|
-
"preview_nonce",
|
|
3134
|
-
"_thumbnail_id",
|
|
3135
|
-
// Common cache-busters/versioning
|
|
3136
|
-
"v",
|
|
3137
|
-
"ver",
|
|
3138
|
-
"version"
|
|
3139
|
-
]);
|
|
3140
|
-
function shouldStrip(key) {
|
|
3141
|
-
if (STRIP_KEYS.has(key)) return true;
|
|
3142
|
-
if (key.startsWith("utm_")) return true;
|
|
3143
|
-
return false;
|
|
3144
|
-
}
|
|
3145
|
-
function parseQuery(query) {
|
|
3146
|
-
if (query === "") return [];
|
|
3147
|
-
return query.split("&").map((pair) => {
|
|
3148
|
-
const eq = pair.indexOf("=");
|
|
3149
|
-
if (eq === -1) return { key: pair, value: null };
|
|
3150
|
-
return { key: pair.slice(0, eq), value: pair.slice(eq + 1) };
|
|
3151
|
-
});
|
|
3152
|
-
}
|
|
3153
|
-
function encodeQuery(pairs) {
|
|
3154
|
-
return pairs.map((p) => p.value === null ? p.key : `${p.key}=${p.value}`).join("&");
|
|
3155
|
-
}
|
|
3156
|
-
function collapseRootIndex(path) {
|
|
3157
|
-
if (path === "/index.html" || path === "/index.php") return "/";
|
|
3158
|
-
return path;
|
|
3159
|
-
}
|
|
3160
|
-
function dropTrailingSlash(path) {
|
|
3161
|
-
if (path.length > 1 && path.endsWith("/")) {
|
|
3162
|
-
return path.replace(/\/+$/, "");
|
|
3163
|
-
}
|
|
3164
|
-
return path;
|
|
3165
|
-
}
|
|
3166
|
-
function absolutizeProjectUrl(url, canonicalDomain) {
|
|
3167
|
-
if (!url) return "";
|
|
3168
|
-
const trimmed = url.trim();
|
|
3169
|
-
if (!trimmed) return "";
|
|
3170
|
-
if (/^https?:\/\//i.test(trimmed)) return trimmed;
|
|
3171
|
-
if (trimmed.startsWith("//")) return `https:${trimmed}`;
|
|
3172
|
-
const host = canonicalDomain.trim().replace(/^https?:\/\//i, "").replace(/\/+$/, "");
|
|
3173
|
-
if (!host) return trimmed;
|
|
3174
|
-
if (trimmed.startsWith("/")) return `https://${host}${trimmed}`;
|
|
3175
|
-
return `https://${host}/${trimmed}`;
|
|
3176
|
-
}
|
|
3177
|
-
function hostOf(value) {
|
|
3178
|
-
if (value == null) return null;
|
|
3179
|
-
const trimmed = value.trim();
|
|
3180
|
-
if (!trimmed) return null;
|
|
3181
|
-
try {
|
|
3182
|
-
const url = trimmed.includes("://") ? new URL(trimmed) : new URL(`https://${trimmed}`);
|
|
3183
|
-
return url.hostname.replace(/^www\./, "").toLowerCase();
|
|
3184
|
-
} catch {
|
|
3185
|
-
return null;
|
|
3186
|
-
}
|
|
3187
|
-
}
|
|
3188
|
-
function normalizeUrlPath(input) {
|
|
3189
|
-
if (input == null) return null;
|
|
3190
|
-
let trimmed = input.trim();
|
|
3191
|
-
if (trimmed === "") return null;
|
|
3192
|
-
trimmed = trimmed.replace(/ /g, " ").replace(/\s+/g, " ").trim();
|
|
3193
|
-
if (trimmed === "" || trimmed === "/") return "/";
|
|
3194
|
-
if (trimmed === "(not set)") return null;
|
|
3195
|
-
trimmed = trimmed.replace(/([a-z0-9])[).]+$/i, "$1");
|
|
3196
|
-
if (trimmed.startsWith("/)") || trimmed.startsWith("/ ")) {
|
|
3197
|
-
trimmed = "/";
|
|
3198
|
-
}
|
|
3199
|
-
if (trimmed.includes(" ")) {
|
|
3200
|
-
trimmed = trimmed.split(" ")[0];
|
|
3201
|
-
}
|
|
3202
|
-
if (trimmed === "" || trimmed === "/") return "/";
|
|
3203
|
-
let pathPart;
|
|
3204
|
-
let queryPart;
|
|
3205
|
-
if (/^https?:\/\//i.test(trimmed)) {
|
|
3206
|
-
let url;
|
|
3207
|
-
try {
|
|
3208
|
-
url = new URL(trimmed);
|
|
3209
|
-
} catch {
|
|
3210
|
-
return null;
|
|
3211
|
-
}
|
|
3212
|
-
pathPart = url.pathname || "/";
|
|
3213
|
-
queryPart = url.search.startsWith("?") ? url.search.slice(1) : url.search;
|
|
3214
|
-
} else {
|
|
3215
|
-
let raw = trimmed;
|
|
3216
|
-
const hashIdx = raw.indexOf("#");
|
|
3217
|
-
if (hashIdx !== -1) raw = raw.slice(0, hashIdx);
|
|
3218
|
-
const qIdx = raw.indexOf("?");
|
|
3219
|
-
if (qIdx === -1) {
|
|
3220
|
-
pathPart = raw;
|
|
3221
|
-
queryPart = "";
|
|
3222
|
-
} else {
|
|
3223
|
-
pathPart = raw.slice(0, qIdx);
|
|
3224
|
-
queryPart = raw.slice(qIdx + 1);
|
|
3225
|
-
}
|
|
3226
|
-
}
|
|
3227
|
-
if (pathPart === "") pathPart = "/";
|
|
3228
|
-
pathPart = collapseRootIndex(pathPart);
|
|
3229
|
-
pathPart = dropTrailingSlash(pathPart);
|
|
3230
|
-
const pairs = parseQuery(queryPart).filter((p) => !shouldStrip(p.key));
|
|
3231
|
-
pairs.sort((a, b) => {
|
|
3232
|
-
if (a.key < b.key) return -1;
|
|
3233
|
-
if (a.key > b.key) return 1;
|
|
3234
|
-
return 0;
|
|
3235
|
-
});
|
|
3236
|
-
if (pairs.length === 0) return pathPart;
|
|
3237
|
-
return `${pathPart}?${encodeQuery(pairs)}`;
|
|
3238
|
-
}
|
|
3239
|
-
|
|
3240
3536
|
// ../contracts/src/citations.ts
|
|
3241
3537
|
import { z as z27 } from "zod";
|
|
3242
3538
|
var citationCoverageProviderSchema = z27.object({
|
|
@@ -4293,75 +4589,6 @@ var trafficEventsResponseSchema = z30.object({
|
|
|
4293
4589
|
events: z30.array(trafficEventEntrySchema)
|
|
4294
4590
|
});
|
|
4295
4591
|
|
|
4296
|
-
// ../contracts/src/embeddings.ts
|
|
4297
|
-
function cosineSimilarity(a, b) {
|
|
4298
|
-
if (a.length === 0 || b.length === 0) {
|
|
4299
|
-
throw new Error("cosineSimilarity: vectors must be non-empty");
|
|
4300
|
-
}
|
|
4301
|
-
if (a.length !== b.length) {
|
|
4302
|
-
throw new Error(`cosineSimilarity: vector length mismatch (${a.length} vs ${b.length})`);
|
|
4303
|
-
}
|
|
4304
|
-
let dot = 0;
|
|
4305
|
-
let magA = 0;
|
|
4306
|
-
let magB = 0;
|
|
4307
|
-
for (let i = 0; i < a.length; i++) {
|
|
4308
|
-
dot += a[i] * b[i];
|
|
4309
|
-
magA += a[i] * a[i];
|
|
4310
|
-
magB += b[i] * b[i];
|
|
4311
|
-
}
|
|
4312
|
-
if (magA === 0 || magB === 0) return 0;
|
|
4313
|
-
return dot / (Math.sqrt(magA) * Math.sqrt(magB));
|
|
4314
|
-
}
|
|
4315
|
-
function clusterByCosine(items, vectors, threshold) {
|
|
4316
|
-
if (threshold < 0 || threshold > 1) {
|
|
4317
|
-
throw new Error(`clusterByCosine: threshold must be in [0, 1], got ${threshold}`);
|
|
4318
|
-
}
|
|
4319
|
-
if (items.length !== vectors.length) {
|
|
4320
|
-
throw new Error(`clusterByCosine: items/vectors length mismatch (${items.length} vs ${vectors.length})`);
|
|
4321
|
-
}
|
|
4322
|
-
if (items.length === 0) return [];
|
|
4323
|
-
const parent = items.map((_, i) => i);
|
|
4324
|
-
const find = (x) => {
|
|
4325
|
-
let root = x;
|
|
4326
|
-
while (parent[root] !== root) root = parent[root];
|
|
4327
|
-
let cur = x;
|
|
4328
|
-
while (parent[cur] !== root) {
|
|
4329
|
-
const next = parent[cur];
|
|
4330
|
-
parent[cur] = root;
|
|
4331
|
-
cur = next;
|
|
4332
|
-
}
|
|
4333
|
-
return root;
|
|
4334
|
-
};
|
|
4335
|
-
const union = (a, b) => {
|
|
4336
|
-
const ra = find(a);
|
|
4337
|
-
const rb = find(b);
|
|
4338
|
-
if (ra !== rb) parent[ra] = rb;
|
|
4339
|
-
};
|
|
4340
|
-
for (let i = 0; i < items.length; i++) {
|
|
4341
|
-
for (let j = i + 1; j < items.length; j++) {
|
|
4342
|
-
if (cosineSimilarity(vectors[i], vectors[j]) >= threshold) {
|
|
4343
|
-
union(i, j);
|
|
4344
|
-
}
|
|
4345
|
-
}
|
|
4346
|
-
}
|
|
4347
|
-
const byRoot = /* @__PURE__ */ new Map();
|
|
4348
|
-
for (let i = 0; i < items.length; i++) {
|
|
4349
|
-
const root = find(i);
|
|
4350
|
-
const existing = byRoot.get(root);
|
|
4351
|
-
if (existing) existing.push(i);
|
|
4352
|
-
else byRoot.set(root, [i]);
|
|
4353
|
-
}
|
|
4354
|
-
return Array.from(byRoot.values()).map((indices) => indices.map((idx) => items[idx]));
|
|
4355
|
-
}
|
|
4356
|
-
function pickClusterRepresentative(cluster) {
|
|
4357
|
-
if (cluster.length === 0) throw new Error("pickClusterRepresentative: cluster is empty");
|
|
4358
|
-
let best = cluster[0];
|
|
4359
|
-
for (let i = 1; i < cluster.length; i++) {
|
|
4360
|
-
if (cluster[i].length < best.length) best = cluster[i];
|
|
4361
|
-
}
|
|
4362
|
-
return best;
|
|
4363
|
-
}
|
|
4364
|
-
|
|
4365
4592
|
// ../contracts/src/formatting.ts
|
|
4366
4593
|
function formatRatio(value) {
|
|
4367
4594
|
if (!Number.isFinite(value) || value === 0) return "0%";
|
|
@@ -4736,6 +4963,11 @@ export {
|
|
|
4736
4963
|
categorizeSource,
|
|
4737
4964
|
categorizeSourceWithCompetitors,
|
|
4738
4965
|
categoryLabel,
|
|
4966
|
+
clusterByCosine,
|
|
4967
|
+
pickClusterRepresentative,
|
|
4968
|
+
absolutizeProjectUrl,
|
|
4969
|
+
hostOf,
|
|
4970
|
+
normalizeUrlPath,
|
|
4739
4971
|
discoveryBucketSchema,
|
|
4740
4972
|
DiscoveryBuckets,
|
|
4741
4973
|
DEFAULT_DISCOVERY_PROMOTE_BUCKETS,
|
|
@@ -4754,6 +4986,11 @@ export {
|
|
|
4754
4986
|
discoveryPromoteRequestSchema,
|
|
4755
4987
|
discoveryPromotePreviewSchema,
|
|
4756
4988
|
discoveryPromoteResultSchema,
|
|
4989
|
+
discoveryHarvestDtoSchema,
|
|
4990
|
+
buildHarvestAnchorTerms,
|
|
4991
|
+
gateHarvestedSearchQueries,
|
|
4992
|
+
applyHarvestSemanticNovelty,
|
|
4993
|
+
aggregateHarvestedQueries,
|
|
4757
4994
|
surfaceClassLabel,
|
|
4758
4995
|
surfaceClassFromCompetitorType,
|
|
4759
4996
|
classifySurfaceFromCategory,
|
|
@@ -4815,9 +5052,6 @@ export {
|
|
|
4815
5052
|
CheckCategories,
|
|
4816
5053
|
doctorReportSchema,
|
|
4817
5054
|
summarizeCheckResults,
|
|
4818
|
-
absolutizeProjectUrl,
|
|
4819
|
-
hostOf,
|
|
4820
|
-
normalizeUrlPath,
|
|
4821
5055
|
citationVisibilityResponseSchema,
|
|
4822
5056
|
emptyCitationVisibility,
|
|
4823
5057
|
citationStateToCited,
|
|
@@ -4854,8 +5088,6 @@ export {
|
|
|
4854
5088
|
trafficEventKindSchema,
|
|
4855
5089
|
TrafficEventKinds,
|
|
4856
5090
|
trafficEventsResponseSchema,
|
|
4857
|
-
clusterByCosine,
|
|
4858
|
-
pickClusterRepresentative,
|
|
4859
5091
|
formatRatio,
|
|
4860
5092
|
formatNumber,
|
|
4861
5093
|
formatDate,
|