wayfind 2.0.36 → 2.0.37
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/content-store.js +108 -0
- package/package.json +1 -1
package/bin/content-store.js
CHANGED
|
@@ -1852,6 +1852,114 @@ async function indexSignals(options = {}) {
|
|
|
1852
1852
|
}
|
|
1853
1853
|
}
|
|
1854
1854
|
|
|
1855
|
+
// ── Chunk long signal entries for better embedding retrieval ──────────────
|
|
1856
|
+
// Split signal content by ## headings into section-level entries.
|
|
1857
|
+
// Each chunk gets its own embedding so semantic search matches at section level.
|
|
1858
|
+
const MIN_CHUNK_CHARS = 200;
|
|
1859
|
+
const MAX_CHUNK_CHARS = 3000;
|
|
1860
|
+
|
|
1861
|
+
// Collect all signal files across all channels for chunking
|
|
1862
|
+
const allSignalFiles = [];
|
|
1863
|
+
for (const ch of channels) {
|
|
1864
|
+
const chDir = path.join(signalsDir, ch);
|
|
1865
|
+
try {
|
|
1866
|
+
const entries = fs.readdirSync(chDir, { withFileTypes: true });
|
|
1867
|
+
for (const e of entries) {
|
|
1868
|
+
if (e.isFile() && e.name.endsWith('.md')) {
|
|
1869
|
+
allSignalFiles.push({ filePath: path.join(chDir, e.name), file: e.name, repo: 'signals/' + ch, channel: ch });
|
|
1870
|
+
}
|
|
1871
|
+
}
|
|
1872
|
+
for (const ownerEntry of entries) {
|
|
1873
|
+
if (!ownerEntry.isDirectory()) continue;
|
|
1874
|
+
const ownerDir = path.join(chDir, ownerEntry.name);
|
|
1875
|
+
let repoEntries;
|
|
1876
|
+
try { repoEntries = fs.readdirSync(ownerDir, { withFileTypes: true }); } catch { continue; }
|
|
1877
|
+
for (const repoEntry of repoEntries) {
|
|
1878
|
+
if (!repoEntry.isDirectory()) continue;
|
|
1879
|
+
const repoDir = path.join(ownerDir, repoEntry.name);
|
|
1880
|
+
let repoFiles;
|
|
1881
|
+
try { repoFiles = fs.readdirSync(repoDir).filter(f => f.endsWith('.md')); } catch { continue; }
|
|
1882
|
+
for (const f of repoFiles) {
|
|
1883
|
+
allSignalFiles.push({ filePath: path.join(repoDir, f), file: f, repo: `${ownerEntry.name}/${repoEntry.name}`, channel: ch });
|
|
1884
|
+
}
|
|
1885
|
+
}
|
|
1886
|
+
}
|
|
1887
|
+
} catch { continue; }
|
|
1888
|
+
}
|
|
1889
|
+
|
|
1890
|
+
for (const { filePath, file, repo, channel: ch } of allSignalFiles) {
|
|
1891
|
+
let content;
|
|
1892
|
+
try {
|
|
1893
|
+
content = fs.readFileSync(filePath, 'utf8');
|
|
1894
|
+
} catch {
|
|
1895
|
+
continue;
|
|
1896
|
+
}
|
|
1897
|
+
if (content.length < MIN_CHUNK_CHARS * 2) continue; // Too short to chunk
|
|
1898
|
+
|
|
1899
|
+
const dateMatch = file.match(/^(\d{4}-\d{2}-\d{2})/);
|
|
1900
|
+
const date = dateMatch ? dateMatch[1] : file.replace(/\.md$/, '');
|
|
1901
|
+
const titleMatch = content.match(/^#\s+(.+)$/m);
|
|
1902
|
+
const parentTitle = titleMatch ? titleMatch[1].trim() : file.replace(/\.md$/, '');
|
|
1903
|
+
const parentId = generateEntryId(date, repo, file.replace(/\.md$/, ''));
|
|
1904
|
+
|
|
1905
|
+
// Split by ## headings
|
|
1906
|
+
const sections = content.split(/^(?=##\s)/m).filter(s => s.trim().length >= MIN_CHUNK_CHARS);
|
|
1907
|
+
if (sections.length <= 1) continue; // Only one section — parent embedding is sufficient
|
|
1908
|
+
|
|
1909
|
+
for (let i = 0; i < sections.length; i++) {
|
|
1910
|
+
let section = sections[i];
|
|
1911
|
+
const headingMatch = section.match(/^##\s+(.+)$/m);
|
|
1912
|
+
const sectionTitle = headingMatch ? headingMatch[1].trim() : `Section ${i + 1}`;
|
|
1913
|
+
const chunkTitle = `${parentTitle} — ${sectionTitle}`;
|
|
1914
|
+
|
|
1915
|
+
if (section.length > MAX_CHUNK_CHARS) {
|
|
1916
|
+
section = section.slice(0, MAX_CHUNK_CHARS);
|
|
1917
|
+
}
|
|
1918
|
+
|
|
1919
|
+
const chunkId = generateEntryId(date, repo, `chunk-${i}-${file.replace(/\.md$/, '')}`);
|
|
1920
|
+
const chunkHash = contentHash(section);
|
|
1921
|
+
const existingChunk = existingIndex.entries[chunkId];
|
|
1922
|
+
|
|
1923
|
+
if (existingChunk && existingChunk.contentHash === chunkHash) {
|
|
1924
|
+
if (doEmbeddings && !existingChunk.hasEmbedding) {
|
|
1925
|
+
try {
|
|
1926
|
+
const vec = await llm.generateEmbedding(section);
|
|
1927
|
+
existingEmbeddings[chunkId] = vec;
|
|
1928
|
+
existingChunk.hasEmbedding = true;
|
|
1929
|
+
} catch {
|
|
1930
|
+
// Embedding failed — leave hasEmbedding falsy so a later indexing run retries this chunk
|
|
1931
|
+
}
|
|
1932
|
+
}
|
|
1933
|
+
continue;
|
|
1934
|
+
}
|
|
1935
|
+
|
|
1936
|
+
existingIndex.entries[chunkId] = {
|
|
1937
|
+
date,
|
|
1938
|
+
repo,
|
|
1939
|
+
title: chunkTitle,
|
|
1940
|
+
source: 'signal-chunk',
|
|
1941
|
+
parentId,
|
|
1942
|
+
chunkIndex: i,
|
|
1943
|
+
user: '',
|
|
1944
|
+
drifted: false,
|
|
1945
|
+
contentHash: chunkHash,
|
|
1946
|
+
contentLength: section.length,
|
|
1947
|
+
tags: [ch, sectionTitle.toLowerCase()],
|
|
1948
|
+
hasEmbedding: false,
|
|
1949
|
+
};
|
|
1950
|
+
|
|
1951
|
+
if (doEmbeddings) {
|
|
1952
|
+
try {
|
|
1953
|
+
const vec = await llm.generateEmbedding(section);
|
|
1954
|
+
existingEmbeddings[chunkId] = vec;
|
|
1955
|
+
existingIndex.entries[chunkId].hasEmbedding = true;
|
|
1956
|
+
} catch {
|
|
1957
|
+
// Continue without embedding
|
|
1958
|
+
}
|
|
1959
|
+
}
|
|
1960
|
+
}
|
|
1961
|
+
}
|
|
1962
|
+
|
|
1855
1963
|
// Save
|
|
1856
1964
|
existingIndex.entryCount = Object.keys(existingIndex.entries).length;
|
|
1857
1965
|
backend.saveIndex(existingIndex);
|
package/package.json
CHANGED