@deepagents/retrieval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +8 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +8 -0
- package/dist/index.js.map +7 -0
- package/dist/lib/connectors/connector.d.ts +25 -0
- package/dist/lib/connectors/connector.d.ts.map +1 -0
- package/dist/lib/connectors/connector.js +1 -0
- package/dist/lib/connectors/connector.js.map +7 -0
- package/dist/lib/connectors/github.d.ts +35 -0
- package/dist/lib/connectors/github.d.ts.map +1 -0
- package/dist/lib/connectors/github.js +155 -0
- package/dist/lib/connectors/github.js.map +7 -0
- package/dist/lib/connectors/index.d.ts +8 -0
- package/dist/lib/connectors/index.d.ts.map +1 -0
- package/dist/lib/connectors/index.js +8 -0
- package/dist/lib/connectors/index.js.map +7 -0
- package/dist/lib/connectors/linear.d.ts +9 -0
- package/dist/lib/connectors/linear.d.ts.map +1 -0
- package/dist/lib/connectors/linear.js +29 -0
- package/dist/lib/connectors/linear.js.map +7 -0
- package/dist/lib/connectors/local.d.ts +7 -0
- package/dist/lib/connectors/local.d.ts.map +1 -0
- package/dist/lib/connectors/local.js +98 -0
- package/dist/lib/connectors/local.js.map +7 -0
- package/dist/lib/connectors/pdf.d.ts +4 -0
- package/dist/lib/connectors/pdf.d.ts.map +1 -0
- package/dist/lib/connectors/pdf.js +58 -0
- package/dist/lib/connectors/pdf.js.map +7 -0
- package/dist/lib/connectors/repo.d.ts +6 -0
- package/dist/lib/connectors/repo.d.ts.map +1 -0
- package/dist/lib/connectors/repo.js +171 -0
- package/dist/lib/connectors/repo.js.map +7 -0
- package/dist/lib/connectors/rss.d.ts +12 -0
- package/dist/lib/connectors/rss.d.ts.map +1 -0
- package/dist/lib/connectors/rss.js +136 -0
- package/dist/lib/connectors/rss.js.map +7 -0
- package/dist/lib/embedders/fastembed.d.ts +11 -0
- package/dist/lib/embedders/fastembed.d.ts.map +1 -0
- package/dist/lib/embedders/fastembed.js +35 -0
- package/dist/lib/embedders/fastembed.js.map +7 -0
- package/dist/lib/embedders/huggingface.d.ts +14 -0
- package/dist/lib/embedders/huggingface.d.ts.map +1 -0
- package/dist/lib/embedders/huggingface.js +40 -0
- package/dist/lib/embedders/huggingface.js.map +7 -0
- package/dist/lib/ingest.d.ts +24 -0
- package/dist/lib/ingest.d.ts.map +1 -0
- package/dist/lib/ingest.js +111 -0
- package/dist/lib/ingest.js.map +7 -0
- package/dist/lib/pipeline.d.ts +2 -0
- package/dist/lib/pipeline.d.ts.map +1 -0
- package/dist/lib/pipeline.js +1 -0
- package/dist/lib/pipeline.js.map +7 -0
- package/dist/lib/sidecar.d.ts +1 -0
- package/dist/lib/sidecar.d.ts.map +1 -0
- package/dist/lib/sidecar.js +1 -0
- package/dist/lib/sidecar.js.map +7 -0
- package/dist/lib/similiarty-search.d.ts +3 -0
- package/dist/lib/similiarty-search.d.ts.map +1 -0
- package/dist/lib/similiarty-search.js +43 -0
- package/dist/lib/similiarty-search.js.map +7 -0
- package/dist/lib/stores/cid.d.ts +2 -0
- package/dist/lib/stores/cid.d.ts.map +1 -0
- package/dist/lib/stores/cid.js +8 -0
- package/dist/lib/stores/cid.js.map +7 -0
- package/dist/lib/stores/sqlite/bun-sqlite.d.ts +1 -0
- package/dist/lib/stores/sqlite/bun-sqlite.d.ts.map +1 -0
- package/dist/lib/stores/sqlite/bun-sqlite.js +1 -0
- package/dist/lib/stores/sqlite/bun-sqlite.js.map +7 -0
- package/dist/lib/stores/sqlite/node-sqlite.d.ts +3 -0
- package/dist/lib/stores/sqlite/node-sqlite.d.ts.map +1 -0
- package/dist/lib/stores/sqlite/node-sqlite.js +14 -0
- package/dist/lib/stores/sqlite/node-sqlite.js.map +7 -0
- package/dist/lib/stores/sqlite/sqlite.d.ts +35 -0
- package/dist/lib/stores/sqlite/sqlite.d.ts.map +1 -0
- package/dist/lib/stores/sqlite/sqlite.js +223 -0
- package/dist/lib/stores/sqlite/sqlite.js.map +7 -0
- package/dist/lib/stores/sqlite/sqlite.sql.d.ts +3 -0
- package/dist/lib/stores/sqlite/sqlite.sql.d.ts.map +1 -0
- package/dist/lib/stores/sqlite/sqlite.sql.js +54 -0
- package/dist/lib/stores/sqlite/sqlite.sql.js.map +7 -0
- package/dist/lib/stores/store.d.ts +28 -0
- package/dist/lib/stores/store.d.ts.map +1 -0
- package/dist/lib/stores/store.js +1 -0
- package/dist/lib/stores/store.js.map +7 -0
- package/package.json +47 -0
package/dist/lib/connectors/repo.js
@@ -0,0 +1,171 @@
import fg from "fast-glob";
import { opendir, readFile, stat } from "node:fs/promises";
import { join } from "node:path";
function repo(dir, extensions, ingestWhen) {
  const sourceId = `repo:${dir}`;
  return {
    sourceId,
    ingestWhen,
    sources: async function* () {
      const paths = await collectFiles(dir, extensions);
      for await (const path of paths) {
        const maxSize = 3 * 1024;
        const st = await stat(path);
        if (st.size > maxSize) {
          continue;
        }
        yield {
          id: path,
          metadata: { repo: dir },
          content: () => readFile(path, "utf8").catch(() => "")
        };
      }
    }
  };
}
async function* findAllGitRepos(root) {
  const stack = [root];
  const skip = [
    "node_modules",
    "Library",
    "Applications",
    "Pictures",
    "Movies",
    "Music",
    "Downloads",
    ".cache",
    ".npm",
    ".pnpm",
    "development"
  ];
  while (stack.length) {
    const folder = stack.pop();
    const isGitRepo = await stat(join(folder, ".git")).then((st) => st.isDirectory() || st.isFile()).catch(() => false);
    if (isGitRepo) {
      yield folder;
      continue;
    }
    const dir = await opendir(folder);
    for await (const dirent of dir) {
      if (dirent.isDirectory() && !dirent.isSymbolicLink() && !skip.includes(dirent.name) && !dirent.name.startsWith(".")) {
        stack.push(join(folder, dirent.name));
      }
    }
  }
}
function detectRepoTooling(repo2) {
  const tools = [];
  return tools;
}
async function gitignore(gitignoreFile) {
  const content = await readFile(gitignoreFile, "utf8").catch(() => "");
  if (!content) {
    return [];
  }
  const patterns = content.split("\n").map((line) => line.trim()).filter((line) => line && !line.startsWith("#"));
  return patterns;
}
async function ignorePatterns(repo2) {
  return [
    // Inherit repo-specific ignore patterns
    ...await gitignore(join(repo2, ".gitignore")),
    // Package managers & dependency dirs
    "node_modules/**",
    "**/node_modules/**",
    "**/.pnpm/**",
    "**/.npm/**",
    "**/.yarn/**",
    "**/vendor/**",
    // PHP / Go modules (when vendored)
    "**/3rdparty/**",
    // Version control + VCS metadata
    "**/.git/**",
    "**/.svn/**",
    "**/.hg/**",
    // OS / system junk
    "**/.DS_Store",
    "**/Thumbs.db",
    "**/Library/**",
    "**/Applications/**",
    "**/Pictures/**",
    "**/Movies/**",
    "**/Music/**",
    "**/Downloads/**",
    "**/.cache/**",
    // Environment / secrets (explicit)
    "**/.env",
    "**/.env.*",
    // Lockfiles & generated dependency state
    "**/*.lock",
    "**/yarn.lock",
    "**/package-lock.json",
    "**/pnpm-lock.yaml",
    // Build / compilation outputs
    "**/dist/**",
    "**/debug/**",
    "**/build/**",
    "**/out/**",
    "**/target/**",
    // Rust / JVM
    "**/bin/**",
    "**/obj/**",
    "**/classes/**",
    // Framework / tool specific build artifacts
    "**/.next/**",
    "**/.vercel/**",
    "**/.turbo/**",
    "**/.docusaurus/**",
    "**/.vite/**",
    "**/.parcel-cache/**",
    "**/.rollup.cache/**",
    "**/.vuepress/**",
    "cdk.out/**",
    // Infra & deployment tooling
    "**/.serverless/**",
    "**/.terraform/**",
    "**/.terragrunt-cache/**",
    "**/.pulumi/**",
    // Coverage & testing caches
    "**/coverage/**",
    "**/.nyc_output/**",
    "**/jest-cache/**",
    "**/.pytest_cache/**",
    // Language / tooling caches
    "**/__pycache__/**",
    "**/.mypy_cache/**",
    "**/.tox/**",
    "**/.gradle/**",
    "**/.mvn/**",
    "**/.eslintcache",
    "**/.stylelintcache",
    // IDE / editor configs + history (we don't want to embed these)
    "**/.idea/**",
    "**/.vscode/**",
    "**/.fleet/**",
    "**/.history/**",
    // Virtual environments
    "**/.venv/**",
    "**/venv/**"
  ];
}
async function collectFiles(repo2, extensions) {
  const exts = extensions.map((ext) => ext.replace(/^\./, ""));
  return fg.stream(
    extensions.length > 1 ? `**/*.{${exts.join(",")}}` : `**/*.${exts[0]}`,
    {
      dot: false,
      onlyFiles: true,
      unique: true,
      absolute: true,
      cwd: repo2,
      ignore: await ignorePatterns(repo2)
    }
  );
}
export {
  collectFiles,
  findAllGitRepos,
  ignorePatterns,
  repo
};
//# sourceMappingURL=repo.js.map
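repo() adapts a local checkout to the package's Connector shape: collectFiles() streams matches through fast-glob with ignorePatterns() layered over the repo's own .gitignore, and sources() skips any file larger than 3 KB (3 * 1024 bytes) before yielding { id, metadata, content } records. A minimal usage sketch, assuming these dist modules are importable by their file paths (the package.json exports map is listed above but its contents are not shown) and leaving ingestWhen undefined, since its expected values live in connector.d.ts, which this excerpt omits:

import { findAllGitRepos, repo } from "@deepagents/retrieval/dist/lib/connectors/repo.js";

// Walk the home directory for git checkouts, then stream the first one's files.
for await (const checkout of findAllGitRepos(process.env.HOME ?? ".")) {
  const connector = repo(checkout, [".ts", ".md"], undefined);
  for await (const source of connector.sources()) {
    console.log(source.id); // absolute path; files over 3 KB were skipped
  }
  break; // just the first repo for this sketch
}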
package/dist/lib/connectors/repo.js.map
@@ -0,0 +1,7 @@
{
"version": 3,
"sources": ["../../../src/lib/connectors/repo.ts"],
"sourcesContent": ["import fg from 'fast-glob';\nimport { opendir, readFile, stat } from 'node:fs/promises';\nimport { join } from 'node:path';\n\nimport type { Connector } from './connector.js';\n\nexport function repo(\n dir: string,\n extensions: string[],\n ingestWhen: Connector['ingestWhen'],\n): Connector {\n const sourceId = `repo:${dir}`;\n return {\n sourceId,\n ingestWhen,\n sources: async function* () {\n const paths = await collectFiles(dir, extensions);\n for await (const path of paths) {\n const maxSize = 3 * 1024; // 3KB\n const st = await stat(path);\n if (st.size > maxSize) {\n continue;\n }\n yield {\n id: path,\n metadata: { repo: dir },\n content: () => readFile(path, 'utf8').catch(() => ''),\n };\n }\n },\n };\n}\n\nexport async function* findAllGitRepos(root: string) {\n const stack = [root];\n const skip = [\n 'node_modules',\n 'Library',\n 'Applications',\n 'Pictures',\n 'Movies',\n 'Music',\n 'Downloads',\n '.cache',\n '.npm',\n '.pnpm',\n 'development',\n ];\n while (stack.length) {\n const folder = stack.pop()!;\n const isGitRepo = await stat(join(folder, '.git'))\n .then((st) => st.isDirectory() || st.isFile())\n .catch(() => false);\n if (isGitRepo) {\n yield folder;\n continue;\n }\n const dir = await opendir(folder);\n for await (const dirent of dir) {\n if (\n dirent.isDirectory() &&\n !dirent.isSymbolicLink() &&\n !skip.includes(dirent.name) &&\n !dirent.name.startsWith('.')\n ) {\n stack.push(join(folder, dirent.name));\n }\n }\n }\n}\n\nfunction detectRepoTooling(repo: string) {\n const tools: string[] = [];\n return tools;\n}\n\nasync function gitignore(gitignoreFile: string) {\n const content = await readFile(gitignoreFile, 'utf8').catch(() => '');\n if (!content) {\n return [];\n }\n const patterns = content\n .split('\\n')\n .map((line) => line.trim())\n .filter((line) => line && !line.startsWith('#'));\n return patterns;\n}\n\nexport async function ignorePatterns(repo: string) {\n return [\n // Inherit repo-specific ignore patterns\n ...(await gitignore(join(repo, '.gitignore'))),\n\n // Package managers & dependency dirs\n 'node_modules/**',\n '**/node_modules/**',\n '**/.pnpm/**',\n '**/.npm/**',\n '**/.yarn/**',\n '**/vendor/**', // PHP / Go modules (when vendored)\n '**/3rdparty/**',\n\n // Version control + VCS metadata\n '**/.git/**',\n '**/.svn/**',\n '**/.hg/**',\n\n // OS / system junk\n '**/.DS_Store',\n '**/Thumbs.db',\n '**/Library/**',\n '**/Applications/**',\n '**/Pictures/**',\n '**/Movies/**',\n '**/Music/**',\n '**/Downloads/**',\n '**/.cache/**',\n\n // Environment / secrets (explicit)\n '**/.env',\n '**/.env.*',\n\n // Lockfiles & generated dependency state\n '**/*.lock',\n '**/yarn.lock',\n '**/package-lock.json',\n '**/pnpm-lock.yaml',\n\n // Build / compilation outputs\n '**/dist/**',\n '**/debug/**',\n '**/build/**',\n '**/out/**',\n '**/target/**', // Rust / JVM\n '**/bin/**',\n '**/obj/**',\n '**/classes/**',\n\n // Framework / tool specific build artifacts\n '**/.next/**',\n '**/.vercel/**',\n '**/.turbo/**',\n '**/.docusaurus/**',\n '**/.vite/**',\n '**/.parcel-cache/**',\n '**/.rollup.cache/**',\n '**/.vuepress/**',\n 'cdk.out/**',\n\n // Infra & deployment tooling\n '**/.serverless/**',\n '**/.terraform/**',\n '**/.terragrunt-cache/**',\n '**/.pulumi/**',\n\n // Coverage & testing caches\n '**/coverage/**',\n '**/.nyc_output/**',\n '**/jest-cache/**',\n '**/.pytest_cache/**',\n\n // Language / tooling caches\n '**/__pycache__/**',\n '**/.mypy_cache/**',\n '**/.tox/**',\n '**/.gradle/**',\n '**/.mvn/**',\n 
'**/.eslintcache',\n '**/.stylelintcache',\n\n // IDE / editor configs + history (we don't want to embed these)\n '**/.idea/**',\n '**/.vscode/**',\n '**/.fleet/**',\n '**/.history/**',\n\n // Virtual environments\n '**/.venv/**',\n '**/venv/**',\n ];\n}\n\nexport async function collectFiles(\n repo: string,\n extensions: string[],\n): Promise<AsyncIterable<string>> {\n const exts = extensions.map((ext) => ext.replace(/^\\./, ''));\n return fg.stream(\n extensions.length > 1 ? `**/*.{${exts.join(',')}}` : `**/*.${exts[0]}`,\n {\n dot: false,\n onlyFiles: true,\n unique: true,\n absolute: true,\n cwd: repo,\n ignore: await ignorePatterns(repo),\n },\n ) as AsyncIterable<string>;\n}\n"],
"mappings": "AAAA,OAAO,QAAQ;AACf,SAAS,SAAS,UAAU,YAAY;AACxC,SAAS,YAAY;AAId,SAAS,KACd,KACA,YACA,YACW;AACX,QAAM,WAAW,QAAQ,GAAG;AAC5B,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,SAAS,mBAAmB;AAC1B,YAAM,QAAQ,MAAM,aAAa,KAAK,UAAU;AAChD,uBAAiB,QAAQ,OAAO;AAC9B,cAAM,UAAU,IAAI;AACpB,cAAM,KAAK,MAAM,KAAK,IAAI;AAC1B,YAAI,GAAG,OAAO,SAAS;AACrB;AAAA,QACF;AACA,cAAM;AAAA,UACJ,IAAI;AAAA,UACJ,UAAU,EAAE,MAAM,IAAI;AAAA,UACtB,SAAS,MAAM,SAAS,MAAM,MAAM,EAAE,MAAM,MAAM,EAAE;AAAA,QACtD;AAAA,MACF;AAAA,IACF;AAAA,EACF;AACF;AAEA,gBAAuB,gBAAgB,MAAc;AACnD,QAAM,QAAQ,CAAC,IAAI;AACnB,QAAM,OAAO;AAAA,IACX;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACA,SAAO,MAAM,QAAQ;AACnB,UAAM,SAAS,MAAM,IAAI;AACzB,UAAM,YAAY,MAAM,KAAK,KAAK,QAAQ,MAAM,CAAC,EAC9C,KAAK,CAAC,OAAO,GAAG,YAAY,KAAK,GAAG,OAAO,CAAC,EAC5C,MAAM,MAAM,KAAK;AACpB,QAAI,WAAW;AACb,YAAM;AACN;AAAA,IACF;AACA,UAAM,MAAM,MAAM,QAAQ,MAAM;AAChC,qBAAiB,UAAU,KAAK;AAC9B,UACE,OAAO,YAAY,KACnB,CAAC,OAAO,eAAe,KACvB,CAAC,KAAK,SAAS,OAAO,IAAI,KAC1B,CAAC,OAAO,KAAK,WAAW,GAAG,GAC3B;AACA,cAAM,KAAK,KAAK,QAAQ,OAAO,IAAI,CAAC;AAAA,MACtC;AAAA,IACF;AAAA,EACF;AACF;AAEA,SAAS,kBAAkBA,OAAc;AACvC,QAAM,QAAkB,CAAC;AACzB,SAAO;AACT;AAEA,eAAe,UAAU,eAAuB;AAC9C,QAAM,UAAU,MAAM,SAAS,eAAe,MAAM,EAAE,MAAM,MAAM,EAAE;AACpE,MAAI,CAAC,SAAS;AACZ,WAAO,CAAC;AAAA,EACV;AACA,QAAM,WAAW,QACd,MAAM,IAAI,EACV,IAAI,CAAC,SAAS,KAAK,KAAK,CAAC,EACzB,OAAO,CAAC,SAAS,QAAQ,CAAC,KAAK,WAAW,GAAG,CAAC;AACjD,SAAO;AACT;AAEA,eAAsB,eAAeA,OAAc;AACjD,SAAO;AAAA;AAAA,IAEL,GAAI,MAAM,UAAU,KAAKA,OAAM,YAAY,CAAC;AAAA;AAAA,IAG5C;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA;AAAA,IACA;AAAA;AAAA,IAGA;AAAA,IACA;AAAA,IACA;AAAA;AAAA,IAGA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA;AAAA,IAGA;AAAA,IACA;AAAA;AAAA,IAGA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA;AAAA,IAGA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA;AAAA,IAGA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA;AAAA,IAGA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA;AAAA,IAGA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA;AAAA,IAGA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA;AAAA,IAGA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA;AAAA,IAGA;AAAA,IACA;AAAA,EACF;AACF;AAEA,eAAsB,aACpBA,OACA,YACgC;AAChC,QAAM,OAAO,WAAW,IAAI,CAAC,QAAQ,IAAI,QAAQ,OAAO,EAAE,CAAC;AAC3D,SAAO,GAAG;AAAA,IACR,WAAW,SAAS,IAAI,SAAS,KAAK,KAAK,GAAG,CAAC,MAAM,QAAQ,KAAK,CAAC,CAAC;AAAA,IACpE;AAAA,MACE,KAAK;AAAA,MACL,WAAW;AAAA,MACX,QAAQ;AAAA,MACR,UAAU;AAAA,MACV,KAAKA;AAAA,MACL,QAAQ,MAAM,eAAeA,KAAI;AAAA,IACnC;AAAA,EACF;AACF;",
"names": ["repo"]
}
package/dist/lib/connectors/rss.d.ts
@@ -0,0 +1,12 @@
export declare function rss(feedUrl: string, options?: {
    maxItems?: number;
    fetchFullArticles?: boolean;
}): {
    sourceId: string;
    instructions: string;
    sources: () => AsyncGenerator<{
        id: any;
        content: () => Promise<string>;
    }, void, unknown>;
};
//# sourceMappingURL=rss.d.ts.map
package/dist/lib/connectors/rss.d.ts.map
@@ -0,0 +1 @@
{"version":3,"file":"rss.d.ts","sourceRoot":"","sources":["../../../src/lib/connectors/rss.ts"],"names":[],"mappings":"AAsGA,wBAAgB,GAAG,CACjB,OAAO,EAAE,MAAM,EACf,OAAO,GAAE;IACP,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,iBAAiB,CAAC,EAAE,OAAO,CAAC;CACxB;;;;;;;EA0DP"}
package/dist/lib/connectors/rss.js
@@ -0,0 +1,136 @@
import { Readability } from "@mozilla/readability";
import Parser from "rss-parser";
const rssParser = new Parser({
  customFields: {
    feed: ["copyright", "language", "managingEditor", "webMaster"],
    item: ["category", "creator", "enclosure", "guid"]
  }
});
async function parseRSSFeed(feedUrl) {
  try {
    console.log(`Fetching RSS feed: ${feedUrl}`);
    const feed = await rssParser.parseURL(feedUrl);
    return {
      title: feed.title || "",
      description: feed.description || "",
      link: feed.link || "",
      language: feed.language || "en",
      lastBuildDate: feed.lastBuildDate || (/* @__PURE__ */ new Date()).toISOString(),
      items: feed.items.map((item) => ({
        title: item.title || "",
        description: item.content || item.summary || item.contentSnippet || "",
        link: item.link || "",
        pubDate: item.pubDate || item.isoDate || "",
        author: item.creator || item.author || "",
        categories: Array.isArray(item.categories) ? item.categories : item.category ? [item.category] : [],
        guid: item.guid || item.guid || "",
        contentEncoded: item["content:encoded"] || item.content || ""
      }))
    };
  } catch (error) {
    console.error(`Failed to parse RSS feed ${feedUrl}:`, error);
    throw new Error(
      `RSS parsing failed: ${error instanceof Error ? error.message : "Unknown error"}`
    );
  }
}
async function fetchArticleContent(url) {
  try {
    console.log(`Fetching full article content from: ${url}`);
    const response = await fetch(url, {
      headers: {
        "User-Agent": "Mozilla/5.0 (compatible; RSS-RAG-Bot/1.0)",
        Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        Connection: "keep-alive"
      },
      signal: AbortSignal.timeout(1e4)
      // 10 second timeout
    });
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }
    const html = await response.text();
    const { JSDOM } = await import("jsdom");
    const dom = new JSDOM(html, { url });
    const document = dom.window.document;
    const reader = new Readability(document);
    const article = reader.parse();
    if (!article) {
      throw new Error("Readability failed to extract article content");
    }
    const fullContent = `${article.title ? article.title + "\n\n" : ""}${article.textContent || article.content}`;
    if (fullContent.length < 200) {
      throw new Error(
        `Extracted content too short (${fullContent.length} chars) - likely failed to find main content`
      );
    }
    console.log(
      `\u2713 Successfully extracted article: "${article.title}" (${fullContent.length} chars, ~${Math.round(fullContent.split(" ").length)} words)`
    );
    return fullContent;
  } catch (error) {
    console.warn(
      `Failed to fetch article content from ${url}:`,
      error instanceof Error ? error.message : error
    );
    return "";
  }
}
function rss(feedUrl, options = {}) {
  const sourceId = `rss:${feedUrl}`;
  const { maxItems = 50, fetchFullArticles = false } = options;
  return {
    sourceId,
    instructions: `You answer questions about articles and content from the RSS feed: ${feedUrl}.
    Always cite the article title and link when referencing specific content.
    The feed contains recent articles, blog posts, and news items.
    When referencing content, include the publication date and author when available.
    ${fetchFullArticles ? "Full article content has been extracted from the original links for comprehensive analysis." : "Content includes RSS summaries and descriptions."}`,
    sources: async function* () {
      const feed = await parseRSSFeed(feedUrl);
      const feedSummary = `RSS Feed: ${feed.title}
Description: ${feed.description}
Website: ${feed.link}
Language: ${feed.language}
Last Updated: ${feed.lastBuildDate}
Total Items: ${feed.items.length}

This feed provides: ${feed.description}`;
      yield {
        id: "feed-info",
        content: async () => feedSummary
      };
      const itemsToProcess = feed.items.slice(0, maxItems);
      for (const item of itemsToProcess) {
        const documentId = item.guid || item.link || `${item.title}-${item.pubDate}`;
        yield {
          id: documentId,
          content: async () => {
            let articleContent = item.contentEncoded || item.description;
            if (fetchFullArticles && item.link) {
              const fullContent = await fetchArticleContent(item.link);
              if (fullContent && fullContent.length > articleContent.length) {
                articleContent = fullContent;
              }
            }
            return `Title: ${item.title}
Author: ${item.author}
Published: ${item.pubDate}
Categories: ${item.categories.join(", ")}
Link: ${item.link}
${fetchFullArticles ? "Full Article Content:" : "Content:"}
${articleContent}

Summary: ${item.title} - ${item.description}`;
          }
        };
      }
    }
  };
}
export {
  rss
};
//# sourceMappingURL=rss.js.map
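rss() yields one synthetic `feed-info` document describing the feed, then up to maxItems article documents; when fetchFullArticles is set it fetches each linked page (10 s timeout) and keeps the jsdom + Readability extraction only if it comes back longer than the RSS body. A usage sketch under the same dist-path import assumption, with an illustrative feed URL:

import { rss } from "@deepagents/retrieval/dist/lib/connectors/rss.js";

const connector = rss("https://example.com/feed.xml", {
  maxItems: 10,
  fetchFullArticles: true // pulls each linked page through jsdom + Readability
});

for await (const doc of connector.sources()) {
  console.log(doc.id, (await doc.content()).slice(0, 80));
}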
package/dist/lib/connectors/rss.js.map
@@ -0,0 +1,7 @@
{
"version": 3,
"sources": ["../../../src/lib/connectors/rss.ts"],
"sourcesContent": ["import { Readability } from '@mozilla/readability';\nimport Parser from 'rss-parser';\n\nconst rssParser = new Parser({\n customFields: {\n feed: ['copyright', 'language', 'managingEditor', 'webMaster'],\n item: ['category', 'creator', 'enclosure', 'guid'],\n },\n});\n\nasync function parseRSSFeed(feedUrl: string) {\n try {\n console.log(`Fetching RSS feed: ${feedUrl}`);\n const feed = await rssParser.parseURL(feedUrl);\n\n return {\n title: feed.title || '',\n description: feed.description || '',\n link: feed.link || '',\n language: feed.language || 'en',\n lastBuildDate: (feed as any).lastBuildDate || new Date().toISOString(),\n items: feed.items.map((item) => ({\n title: item.title || '',\n description: item.content || item.summary || item.contentSnippet || '',\n link: item.link || '',\n pubDate: item.pubDate || item.isoDate || '',\n author: item.creator || (item as any).author || '',\n categories: Array.isArray(item.categories)\n ? item.categories\n : item.category\n ? [item.category]\n : [],\n guid: item.guid || item.guid || '',\n contentEncoded:\n (item as any)['content:encoded'] || (item as any).content || '',\n })),\n };\n } catch (error) {\n console.error(`Failed to parse RSS feed ${feedUrl}:`, error);\n throw new Error(\n `RSS parsing failed: ${error instanceof Error ? error.message : 'Unknown error'}`,\n );\n }\n}\n\nasync function fetchArticleContent(url: string): Promise<string> {\n try {\n console.log(`Fetching full article content from: ${url}`);\n\n const response = await fetch(url, {\n headers: {\n 'User-Agent': 'Mozilla/5.0 (compatible; RSS-RAG-Bot/1.0)',\n Accept:\n 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',\n 'Accept-Language': 'en-US,en;q=0.5',\n 'Accept-Encoding': 'gzip, deflate',\n Connection: 'keep-alive',\n },\n signal: AbortSignal.timeout(10000), // 10 second timeout\n });\n\n if (!response.ok) {\n throw new Error(`HTTP ${response.status}: ${response.statusText}`);\n }\n\n const html = await response.text();\n\n const { JSDOM } = await import('jsdom');\n const dom = new JSDOM(html, { url });\n const document = dom.window.document;\n\n // Use Mozilla's Readability to extract the main content\n const reader = new Readability(document);\n const article = reader.parse();\n\n if (!article) {\n throw new Error('Readability failed to extract article content');\n }\n\n // Combine title and content for comprehensive text\n const fullContent = `${article.title ? article.title + '\\n\\n' : ''}${article.textContent || article.content}`;\n\n // Ensure we have meaningful content (at least 200 characters for full articles)\n if (fullContent.length < 200) {\n throw new Error(\n `Extracted content too short (${fullContent.length} chars) - likely failed to find main content`,\n );\n }\n\n console.log(\n `\u2713 Successfully extracted article: \"${article.title}\" (${fullContent.length} chars, ~${Math.round(fullContent.split(' ').length)} words)`,\n );\n\n return fullContent;\n } catch (error) {\n console.warn(\n `Failed to fetch article content from ${url}:`,\n error instanceof Error ? 
error.message : error,\n );\n return ''; // Return empty string on failure rather than throwing\n }\n}\nexport function rss(\n feedUrl: string,\n options: {\n maxItems?: number;\n fetchFullArticles?: boolean;\n } = {},\n) {\n const sourceId = `rss:${feedUrl}`;\n const { maxItems = 50, fetchFullArticles = false } = options;\n\n return {\n sourceId,\n instructions: `You answer questions about articles and content from the RSS feed: ${feedUrl}.\n Always cite the article title and link when referencing specific content.\n The feed contains recent articles, blog posts, and news items.\n When referencing content, include the publication date and author when available.\n ${fetchFullArticles ? 'Full article content has been extracted from the original links for comprehensive analysis.' : 'Content includes RSS summaries and descriptions.'}`,\n sources: async function* () {\n const feed = await parseRSSFeed(feedUrl);\n // Add feed summary source\n const feedSummary = `RSS Feed: ${feed.title}\nDescription: ${feed.description}\nWebsite: ${feed.link}\nLanguage: ${feed.language}\nLast Updated: ${feed.lastBuildDate}\nTotal Items: ${feed.items.length}\n\nThis feed provides: ${feed.description}`;\n yield {\n id: 'feed-info',\n content: async () => feedSummary,\n };\n\n // Individual article sources (limit to maxItems)\n const itemsToProcess = feed.items.slice(0, maxItems);\n for (const item of itemsToProcess) {\n const documentId =\n item.guid || item.link || `${item.title}-${item.pubDate}`;\n yield {\n id: documentId,\n content: async () => {\n // Try full article fetch if enabled, fallback to RSS content\n let articleContent = item.contentEncoded || item.description;\n if (fetchFullArticles && item.link) {\n const fullContent = await fetchArticleContent(item.link);\n if (fullContent && fullContent.length > articleContent.length) {\n articleContent = fullContent;\n }\n }\n return `Title: ${item.title}\nAuthor: ${item.author}\nPublished: ${item.pubDate}\nCategories: ${item.categories.join(', ')}\nLink: ${item.link}\n${fetchFullArticles ? 'Full Article Content:' : 'Content:'}\n${articleContent}\n\nSummary: ${item.title} - ${item.description}`;\n },\n };\n }\n },\n };\n}\n"],
"mappings": "AAAA,SAAS,mBAAmB;AAC5B,OAAO,YAAY;AAEnB,MAAM,YAAY,IAAI,OAAO;AAAA,EAC3B,cAAc;AAAA,IACZ,MAAM,CAAC,aAAa,YAAY,kBAAkB,WAAW;AAAA,IAC7D,MAAM,CAAC,YAAY,WAAW,aAAa,MAAM;AAAA,EACnD;AACF,CAAC;AAED,eAAe,aAAa,SAAiB;AAC3C,MAAI;AACF,YAAQ,IAAI,sBAAsB,OAAO,EAAE;AAC3C,UAAM,OAAO,MAAM,UAAU,SAAS,OAAO;AAE7C,WAAO;AAAA,MACL,OAAO,KAAK,SAAS;AAAA,MACrB,aAAa,KAAK,eAAe;AAAA,MACjC,MAAM,KAAK,QAAQ;AAAA,MACnB,UAAU,KAAK,YAAY;AAAA,MAC3B,eAAgB,KAAa,kBAAiB,oBAAI,KAAK,GAAE,YAAY;AAAA,MACrE,OAAO,KAAK,MAAM,IAAI,CAAC,UAAU;AAAA,QAC/B,OAAO,KAAK,SAAS;AAAA,QACrB,aAAa,KAAK,WAAW,KAAK,WAAW,KAAK,kBAAkB;AAAA,QACpE,MAAM,KAAK,QAAQ;AAAA,QACnB,SAAS,KAAK,WAAW,KAAK,WAAW;AAAA,QACzC,QAAQ,KAAK,WAAY,KAAa,UAAU;AAAA,QAChD,YAAY,MAAM,QAAQ,KAAK,UAAU,IACrC,KAAK,aACL,KAAK,WACH,CAAC,KAAK,QAAQ,IACd,CAAC;AAAA,QACP,MAAM,KAAK,QAAQ,KAAK,QAAQ;AAAA,QAChC,gBACG,KAAa,iBAAiB,KAAM,KAAa,WAAW;AAAA,MACjE,EAAE;AAAA,IACJ;AAAA,EACF,SAAS,OAAO;AACd,YAAQ,MAAM,4BAA4B,OAAO,KAAK,KAAK;AAC3D,UAAM,IAAI;AAAA,MACR,uBAAuB,iBAAiB,QAAQ,MAAM,UAAU,eAAe;AAAA,IACjF;AAAA,EACF;AACF;AAEA,eAAe,oBAAoB,KAA8B;AAC/D,MAAI;AACF,YAAQ,IAAI,uCAAuC,GAAG,EAAE;AAExD,UAAM,WAAW,MAAM,MAAM,KAAK;AAAA,MAChC,SAAS;AAAA,QACP,cAAc;AAAA,QACd,QACE;AAAA,QACF,mBAAmB;AAAA,QACnB,mBAAmB;AAAA,QACnB,YAAY;AAAA,MACd;AAAA,MACA,QAAQ,YAAY,QAAQ,GAAK;AAAA;AAAA,IACnC,CAAC;AAED,QAAI,CAAC,SAAS,IAAI;AAChB,YAAM,IAAI,MAAM,QAAQ,SAAS,MAAM,KAAK,SAAS,UAAU,EAAE;AAAA,IACnE;AAEA,UAAM,OAAO,MAAM,SAAS,KAAK;AAEjC,UAAM,EAAE,MAAM,IAAI,MAAM,OAAO,OAAO;AACtC,UAAM,MAAM,IAAI,MAAM,MAAM,EAAE,IAAI,CAAC;AACnC,UAAM,WAAW,IAAI,OAAO;AAG5B,UAAM,SAAS,IAAI,YAAY,QAAQ;AACvC,UAAM,UAAU,OAAO,MAAM;AAE7B,QAAI,CAAC,SAAS;AACZ,YAAM,IAAI,MAAM,+CAA+C;AAAA,IACjE;AAGA,UAAM,cAAc,GAAG,QAAQ,QAAQ,QAAQ,QAAQ,SAAS,EAAE,GAAG,QAAQ,eAAe,QAAQ,OAAO;AAG3G,QAAI,YAAY,SAAS,KAAK;AAC5B,YAAM,IAAI;AAAA,QACR,gCAAgC,YAAY,MAAM;AAAA,MACpD;AAAA,IACF;AAEA,YAAQ;AAAA,MACN,2CAAsC,QAAQ,KAAK,MAAM,YAAY,MAAM,YAAY,KAAK,MAAM,YAAY,MAAM,GAAG,EAAE,MAAM,CAAC;AAAA,IAClI;AAEA,WAAO;AAAA,EACT,SAAS,OAAO;AACd,YAAQ;AAAA,MACN,wCAAwC,GAAG;AAAA,MAC3C,iBAAiB,QAAQ,MAAM,UAAU;AAAA,IAC3C;AACA,WAAO;AAAA,EACT;AACF;AACO,SAAS,IACd,SACA,UAGI,CAAC,GACL;AACA,QAAM,WAAW,OAAO,OAAO;AAC/B,QAAM,EAAE,WAAW,IAAI,oBAAoB,MAAM,IAAI;AAErD,SAAO;AAAA,IACL;AAAA,IACA,cAAc,sEAAsE,OAAO;AAAA;AAAA;AAAA;AAAA,QAIvF,oBAAoB,gGAAgG,kDAAkD;AAAA,IAC1K,SAAS,mBAAmB;AAC1B,YAAM,OAAO,MAAM,aAAa,OAAO;AAEvC,YAAM,cAAc,aAAa,KAAK,KAAK;AAAA,eAClC,KAAK,WAAW;AAAA,WACpB,KAAK,IAAI;AAAA,YACR,KAAK,QAAQ;AAAA,gBACT,KAAK,aAAa;AAAA,eACnB,KAAK,MAAM,MAAM;AAAA;AAAA,sBAEV,KAAK,WAAW;AAChC,YAAM;AAAA,QACJ,IAAI;AAAA,QACJ,SAAS,YAAY;AAAA,MACvB;AAGA,YAAM,iBAAiB,KAAK,MAAM,MAAM,GAAG,QAAQ;AACnD,iBAAW,QAAQ,gBAAgB;AACjC,cAAM,aACJ,KAAK,QAAQ,KAAK,QAAQ,GAAG,KAAK,KAAK,IAAI,KAAK,OAAO;AACzD,cAAM;AAAA,UACJ,IAAI;AAAA,UACJ,SAAS,YAAY;AAEnB,gBAAI,iBAAiB,KAAK,kBAAkB,KAAK;AACjD,gBAAI,qBAAqB,KAAK,MAAM;AAClC,oBAAM,cAAc,MAAM,oBAAoB,KAAK,IAAI;AACvD,kBAAI,eAAe,YAAY,SAAS,eAAe,QAAQ;AAC7D,iCAAiB;AAAA,cACnB;AAAA,YACF;AACA,mBAAO,UAAU,KAAK,KAAK;AAAA,UAC7B,KAAK,MAAM;AAAA,aACR,KAAK,OAAO;AAAA,cACX,KAAK,WAAW,KAAK,IAAI,CAAC;AAAA,QAChC,KAAK,IAAI;AAAA,EACf,oBAAoB,0BAA0B,UAAU;AAAA,EACxD,cAAc;AAAA;AAAA,WAEL,KAAK,KAAK,MAAM,KAAK,WAAW;AAAA,UACjC;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAAA,EACF;AACF;",
"names": []
}
package/dist/lib/embedders/fastembed.d.ts
@@ -0,0 +1,11 @@
import { EmbeddingModel } from 'fastembed';
import type { Embedder } from '../stores/store.js';
type StandardModel = EmbeddingModel.AllMiniLML6V2 | EmbeddingModel.BGEBaseEN | EmbeddingModel.BGEBaseENV15 | EmbeddingModel.BGESmallEN | EmbeddingModel.BGESmallENV15 | EmbeddingModel.BGESmallZH | EmbeddingModel.MLE5Large;
export interface FastEmbedOptions {
    model?: StandardModel;
    batchSize?: number;
    cacheDir?: string;
}
export declare function fastembed(options?: FastEmbedOptions): Embedder;
export {};
//# sourceMappingURL=fastembed.d.ts.map
package/dist/lib/embedders/fastembed.d.ts.map
@@ -0,0 +1 @@
{"version":3,"file":"fastembed.d.ts","sourceRoot":"","sources":["../../../src/lib/embedders/fastembed.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAiB,MAAM,WAAW,CAAC;AAE1D,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AAEnD,KAAK,aAAa,GACd,cAAc,CAAC,aAAa,GAC5B,cAAc,CAAC,SAAS,GACxB,cAAc,CAAC,YAAY,GAC3B,cAAc,CAAC,UAAU,GACzB,cAAc,CAAC,aAAa,GAC5B,cAAc,CAAC,UAAU,GACzB,cAAc,CAAC,SAAS,CAAC;AAE7B,MAAM,WAAW,gBAAgB;IAC/B,KAAK,CAAC,EAAE,aAAa,CAAC;IACtB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,wBAAgB,SAAS,CAAC,OAAO,GAAE,gBAAqB,GAAG,QAAQ,CAoClE"}
package/dist/lib/embedders/fastembed.js
@@ -0,0 +1,35 @@
import { EmbeddingModel, FlagEmbedding } from "fastembed";
function fastembed(options = {}) {
  const {
    model: modelId = EmbeddingModel.BGESmallENV15,
    batchSize,
    cacheDir
  } = options;
  let modelPromise = null;
  const getModel = () => {
    if (!modelPromise) {
      modelPromise = FlagEmbedding.init({
        model: modelId,
        cacheDir
      });
    }
    return modelPromise;
  };
  return async (documents) => {
    const model = await getModel();
    const batches = model.embed(documents, batchSize);
    const embeddings = [];
    let dimensions = 0;
    for await (const batch of batches) {
      for (const vec of batch) {
        if (dimensions === 0) dimensions = vec.length;
        embeddings.push(vec);
      }
    }
    return { embeddings, dimensions };
  };
}
export {
  fastembed
};
//# sourceMappingURL=fastembed.js.map
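fastembed() returns an Embedder closure that lazily initializes FlagEmbedding on first use and memoizes the init promise, so one embedder instance shares a single model across calls. A sketch (batchSize and cacheDir values are illustrative; the model defaults to EmbeddingModel.BGESmallENV15 as shown above):

import { fastembed } from "@deepagents/retrieval/dist/lib/embedders/fastembed.js";

const embed = fastembed({ batchSize: 16, cacheDir: "./.fastembed-cache" });
const { embeddings, dimensions } = await embed(["hello world", "vector search"]);
console.log(embeddings.length, dimensions); // 2 vectors; width depends on the model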
package/dist/lib/embedders/fastembed.js.map
@@ -0,0 +1,7 @@
{
"version": 3,
"sources": ["../../../src/lib/embedders/fastembed.ts"],
"sourcesContent": ["import { EmbeddingModel, FlagEmbedding } from 'fastembed';\n\nimport type { Embedder } from '../stores/store.js';\n\ntype StandardModel =\n | EmbeddingModel.AllMiniLML6V2\n | EmbeddingModel.BGEBaseEN\n | EmbeddingModel.BGEBaseENV15\n | EmbeddingModel.BGESmallEN\n | EmbeddingModel.BGESmallENV15\n | EmbeddingModel.BGESmallZH\n | EmbeddingModel.MLE5Large;\n\nexport interface FastEmbedOptions {\n model?: StandardModel;\n batchSize?: number;\n cacheDir?: string;\n}\n\nexport function fastembed(options: FastEmbedOptions = {}): Embedder {\n const {\n model: modelId = EmbeddingModel.BGESmallENV15,\n batchSize,\n cacheDir,\n } = options;\n\n let modelPromise: Promise<\n Awaited<ReturnType<typeof FlagEmbedding.init>>\n > | null = null;\n const getModel = () => {\n if (!modelPromise) {\n modelPromise = FlagEmbedding.init({\n model: modelId,\n cacheDir,\n });\n }\n return modelPromise;\n };\n\n return async (documents: string[]) => {\n const model = await getModel();\n const batches = model.embed(documents, batchSize);\n\n const embeddings: number[][] = [];\n let dimensions = 0;\n\n for await (const batch of batches) {\n for (const vec of batch) {\n if (dimensions === 0) dimensions = vec.length;\n embeddings.push(vec);\n }\n }\n\n return { embeddings, dimensions };\n };\n}\n"],
"mappings": "AAAA,SAAS,gBAAgB,qBAAqB;AAmBvC,SAAS,UAAU,UAA4B,CAAC,GAAa;AAClE,QAAM;AAAA,IACJ,OAAO,UAAU,eAAe;AAAA,IAChC;AAAA,IACA;AAAA,EACF,IAAI;AAEJ,MAAI,eAEO;AACX,QAAM,WAAW,MAAM;AACrB,QAAI,CAAC,cAAc;AACjB,qBAAe,cAAc,KAAK;AAAA,QAChC,OAAO;AAAA,QACP;AAAA,MACF,CAAC;AAAA,IACH;AACA,WAAO;AAAA,EACT;AAEA,SAAO,OAAO,cAAwB;AACpC,UAAM,QAAQ,MAAM,SAAS;AAC7B,UAAM,UAAU,MAAM,MAAM,WAAW,SAAS;AAEhD,UAAM,aAAyB,CAAC;AAChC,QAAI,aAAa;AAEjB,qBAAiB,SAAS,SAAS;AACjC,iBAAW,OAAO,OAAO;AACvB,YAAI,eAAe,EAAG,cAAa,IAAI;AACvC,mBAAW,KAAK,GAAG;AAAA,MACrB;AAAA,IACF;AAEA,WAAO,EAAE,YAAY,WAAW;AAAA,EAClC;AACF;",
"names": []
}
package/dist/lib/embedders/huggingface.d.ts
@@ -0,0 +1,14 @@
import type { FeatureExtractionPipeline, Tensor } from '@huggingface/transformers';
import type { Embedder } from '../stores/store.js';
export type FeatureExtractionFn = FeatureExtractionPipeline;
export interface EmbeddingOptions {
    extractorFn: () => Promise<FeatureExtractionPipeline> | FeatureExtractionPipeline;
    pooling?: 'mean' | 'cls';
    normalize?: boolean;
}
export declare function tensorToEmbeddings(tensor: Tensor): {
    embeddings: Float32Array<ArrayBufferLike>[];
    dimensions: number;
};
export declare function huggingface(extractorFn: () => Promise<FeatureExtractionPipeline> | FeatureExtractionPipeline): Embedder;
//# sourceMappingURL=huggingface.d.ts.map
package/dist/lib/embedders/huggingface.d.ts.map
@@ -0,0 +1 @@
{"version":3,"file":"huggingface.d.ts","sourceRoot":"","sources":["../../../src/lib/embedders/huggingface.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,yBAAyB,EACzB,MAAM,EACP,MAAM,2BAA2B,CAAC;AAEnC,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AAEnD,MAAM,MAAM,mBAAmB,GAAG,yBAAyB,CAAC;AAE5D,MAAM,WAAW,gBAAgB;IAC/B,WAAW,EAAE,MACT,OAAO,CAAC,yBAAyB,CAAC,GAClC,yBAAyB,CAAC;IAC9B,OAAO,CAAC,EAAE,MAAM,GAAG,KAAK,CAAC;IACzB,SAAS,CAAC,EAAE,OAAO,CAAC;CACrB;AAeD,wBAAgB,kBAAkB,CAAC,MAAM,EAAE,MAAM;;;EAuBhD;AAED,wBAAgB,WAAW,CACzB,WAAW,EAAE,MACT,OAAO,CAAC,yBAAyB,CAAC,GAClC,yBAAyB,GAC5B,QAAQ,CAEV"}
package/dist/lib/embedders/huggingface.js
@@ -0,0 +1,40 @@
async function textEmbeddings(inputs, { extractorFn, pooling, normalize }) {
  inputs = (Array.isArray(inputs) ? inputs : [inputs]).map((it) => it.trim());
  const extractor = await extractorFn();
  const tensor = await extractor(inputs, {
    pooling: pooling ?? "mean",
    normalize: normalize ?? true
  });
  return tensorToEmbeddings(tensor);
}
function tensorToEmbeddings(tensor) {
  const dims = tensor.dims;
  if (!Array.isArray(dims) || dims.length < 2) {
    throw new Error(`Unexpected tensor dims: ${JSON.stringify(dims)}`);
  }
  const batchSize = dims[0];
  const hiddenSize = dims[dims.length - 1];
  const expectedLen = batchSize * hiddenSize;
  if (tensor.data.length !== expectedLen) {
    throw new Error(
      `Data length mismatch: got ${tensor.data.length}, expected ${expectedLen} (batch=${batchSize}, hidden=${hiddenSize})`
    );
  }
  const embeddings = [];
  const dimensions = tensor.dims[tensor.dims.length - 1];
  for (let i = 0; i < batchSize; i++) {
    const start = i * hiddenSize;
    embeddings.push(
      tensor.data.subarray(start, start + hiddenSize)
    );
  }
  return { embeddings, dimensions };
}
function huggingface(extractorFn) {
  return (documents) => textEmbeddings(documents, { extractorFn });
}
export {
  huggingface,
  tensorToEmbeddings
};
//# sourceMappingURL=huggingface.js.map
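huggingface() adapts any @huggingface/transformers feature-extraction pipeline to the same Embedder signature; tensorToEmbeddings() validates the [batch, hidden] dims and slices the flat tensor data into per-document Float32Array views with subarray, avoiding copies. A sketch assuming Transformers.js's pipeline() loader and an illustrative model id:

import { pipeline } from "@huggingface/transformers";
import { huggingface } from "@deepagents/retrieval/dist/lib/embedders/huggingface.js";

// Load the extractor once; the closure reuses it for every embed call.
const extractor = await pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2");
const embed = huggingface(() => extractor);

const { embeddings, dimensions } = await embed(["hello world"]);
console.log(dimensions); // 384 for this particular model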
package/dist/lib/embedders/huggingface.js.map
@@ -0,0 +1,7 @@
{
"version": 3,
"sources": ["../../../src/lib/embedders/huggingface.ts"],
"sourcesContent": ["import type {\n FeatureExtractionPipeline,\n Tensor,\n} from '@huggingface/transformers';\n\nimport type { Embedder } from '../stores/store.js';\n\nexport type FeatureExtractionFn = FeatureExtractionPipeline;\n\nexport interface EmbeddingOptions {\n extractorFn: () =>\n | Promise<FeatureExtractionPipeline>\n | FeatureExtractionPipeline;\n pooling?: 'mean' | 'cls';\n normalize?: boolean;\n}\n\nasync function textEmbeddings(\n inputs: string | string[],\n { extractorFn, pooling, normalize }: EmbeddingOptions,\n) {\n inputs = (Array.isArray(inputs) ? inputs : [inputs]).map((it) => it.trim());\n const extractor = await extractorFn();\n const tensor = await extractor(inputs, {\n pooling: pooling ?? 'mean',\n normalize: normalize ?? true,\n });\n return tensorToEmbeddings(tensor);\n}\n\nexport function tensorToEmbeddings(tensor: Tensor) {\n const dims = tensor.dims;\n if (!Array.isArray(dims) || dims.length < 2) {\n throw new Error(`Unexpected tensor dims: ${JSON.stringify(dims)}`);\n }\n const batchSize = dims[0];\n const hiddenSize = dims[dims.length - 1];\n const expectedLen = batchSize * hiddenSize;\n if (tensor.data.length !== expectedLen) {\n throw new Error(\n `Data length mismatch: got ${tensor.data.length}, expected ${expectedLen} (batch=${batchSize}, hidden=${hiddenSize})`,\n );\n }\n // Reuse the underlying typed array without copying; we'll copy when creating Buffer for SQLite\n const embeddings: Float32Array[] = [];\n const dimensions = tensor.dims[tensor.dims.length - 1];\n for (let i = 0; i < batchSize; i++) {\n const start = i * hiddenSize;\n embeddings.push(\n (tensor.data as Float32Array).subarray(start, start + hiddenSize),\n );\n }\n return { embeddings, dimensions };\n}\n\nexport function huggingface(\n extractorFn: () =>\n | Promise<FeatureExtractionPipeline>\n | FeatureExtractionPipeline,\n): Embedder {\n return (documents) => textEmbeddings(documents, { extractorFn });\n}\n"],
"mappings": "AAiBA,eAAe,eACb,QACA,EAAE,aAAa,SAAS,UAAU,GAClC;AACA,YAAU,MAAM,QAAQ,MAAM,IAAI,SAAS,CAAC,MAAM,GAAG,IAAI,CAAC,OAAO,GAAG,KAAK,CAAC;AAC1E,QAAM,YAAY,MAAM,YAAY;AACpC,QAAM,SAAS,MAAM,UAAU,QAAQ;AAAA,IACrC,SAAS,WAAW;AAAA,IACpB,WAAW,aAAa;AAAA,EAC1B,CAAC;AACD,SAAO,mBAAmB,MAAM;AAClC;AAEO,SAAS,mBAAmB,QAAgB;AACjD,QAAM,OAAO,OAAO;AACpB,MAAI,CAAC,MAAM,QAAQ,IAAI,KAAK,KAAK,SAAS,GAAG;AAC3C,UAAM,IAAI,MAAM,2BAA2B,KAAK,UAAU,IAAI,CAAC,EAAE;AAAA,EACnE;AACA,QAAM,YAAY,KAAK,CAAC;AACxB,QAAM,aAAa,KAAK,KAAK,SAAS,CAAC;AACvC,QAAM,cAAc,YAAY;AAChC,MAAI,OAAO,KAAK,WAAW,aAAa;AACtC,UAAM,IAAI;AAAA,MACR,6BAA6B,OAAO,KAAK,MAAM,cAAc,WAAW,WAAW,SAAS,YAAY,UAAU;AAAA,IACpH;AAAA,EACF;AAEA,QAAM,aAA6B,CAAC;AACpC,QAAM,aAAa,OAAO,KAAK,OAAO,KAAK,SAAS,CAAC;AACrD,WAAS,IAAI,GAAG,IAAI,WAAW,KAAK;AAClC,UAAM,QAAQ,IAAI;AAClB,eAAW;AAAA,MACR,OAAO,KAAsB,SAAS,OAAO,QAAQ,UAAU;AAAA,IAClE;AAAA,EACF;AACA,SAAO,EAAE,YAAY,WAAW;AAClC;AAEO,SAAS,YACd,aAGU;AACV,SAAO,CAAC,cAAc,eAAe,WAAW,EAAE,YAAY,CAAC;AACjE;",
"names": []
}
package/dist/lib/ingest.d.ts
@@ -0,0 +1,24 @@
import type { Connector } from './connectors/connector.js';
import type { Splitter } from './pipeline.js';
import type { Embedder, Store } from './stores/store.js';
export interface IngestionConfig {
    connector: Connector;
    store: Store;
    splitter?: Splitter;
    embedder: Embedder;
}
export declare function ingest(config: IngestionConfig, callback?: (it: string) => void): Promise<void>;
export type ChunkPosition = {
    startLine: number;
    startColumn: number;
    endLine: number;
    endColumn: number;
};
export type SplitChunkWithPosition = {
    content: string;
    position: ChunkPosition | null;
    index: number;
};
export declare function splitTypeScriptWithPositions(id: string, content: string): Promise<SplitChunkWithPosition[]>;
export declare function splitTypeScript(id: string, content: string): Promise<string[]>;
//# sourceMappingURL=ingest.d.ts.map
package/dist/lib/ingest.d.ts.map
@@ -0,0 +1 @@
{"version":3,"file":"ingest.d.ts","sourceRoot":"","sources":["../../src/lib/ingest.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,2BAA2B,CAAC;AAC3D,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AAE9C,OAAO,KAAK,EAAE,QAAQ,EAAE,KAAK,EAAE,MAAM,mBAAmB,CAAC;AAEzD,MAAM,WAAW,eAAe;IAC9B,SAAS,EAAE,SAAS,CAAC;IACrB,KAAK,EAAE,KAAK,CAAC;IACb,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,QAAQ,EAAE,QAAQ,CAAC;CACpB;AAED,wBAAsB,MAAM,CAC1B,MAAM,EAAE,eAAe,EACvB,QAAQ,CAAC,EAAE,CAAC,EAAE,EAAE,MAAM,KAAK,IAAI,iBAkChC;AAOD,MAAM,MAAM,aAAa,GAAG;IAC1B,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;CACnB,CAAC;AAEF,MAAM,MAAM,sBAAsB,GAAG;IACnC,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,aAAa,GAAG,IAAI,CAAC;IAC/B,KAAK,EAAE,MAAM,CAAC;CACf,CAAC;AA+EF,wBAAsB,4BAA4B,CAChD,EAAE,EAAE,MAAM,EACV,OAAO,EAAE,MAAM,GACd,OAAO,CAAC,sBAAsB,EAAE,CAAC,CAQnC;AAED,wBAAsB,eAAe,CAAC,EAAE,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,qBAGhE"}
package/dist/lib/ingest.js
@@ -0,0 +1,111 @@
import {
  MarkdownTextSplitter,
  RecursiveCharacterTextSplitter
} from "langchain/text_splitter";
import { cid } from "./stores/cid.js";
async function ingest(config, callback) {
  const splitter = config.splitter ?? split;
  const embedder = config.embedder;
  const corpuses = config.connector.sources();
  for await (const it of corpuses) {
    callback?.(it.id);
    const content = await it.content();
    if (!content.trim()) {
      continue;
    }
    await config.store.index(config.connector.sourceId, {
      id: it.id,
      cid: cid(content),
      metadata: it.metadata,
      chunker: async function* () {
        const values = await splitter(it.id, content);
        const batchSize = 40;
        for (let i = 0; i < values.length; i += batchSize) {
          const batch = values.slice(i, i + batchSize);
          const { embeddings } = await embedder(batch);
          for (let j = 0; j < embeddings.length; j++) {
            yield {
              content: batch[j],
              embedding: embeddings[j]
            };
          }
        }
      }
    });
  }
}
function split(id, content) {
  const splitter = new MarkdownTextSplitter();
  return splitter.splitText(content);
}
function normalizeNewlines(value) {
  return value.replace(/\r\n/g, "\n");
}
function computePositions(originalContent, chunks) {
  if (!chunks.length) {
    return [];
  }
  const normalizedContent = normalizeNewlines(originalContent);
  const positions = [];
  let searchOffset = 0;
  for (const chunk of chunks) {
    const normalizedChunk = normalizeNewlines(chunk);
    const trimmedChunk = normalizedChunk.trim();
    const seek = (needle, fromIndex) => needle ? normalizedContent.indexOf(needle, fromIndex) : -1;
    let matchIndex = seek(normalizedChunk, searchOffset);
    let matchValue = normalizedChunk;
    if (matchIndex === -1 && trimmedChunk) {
      matchIndex = seek(trimmedChunk, searchOffset);
      matchValue = trimmedChunk;
    }
    if (matchIndex === -1) {
      matchIndex = seek(normalizedChunk, 0);
      matchValue = normalizedChunk;
    }
    if (matchIndex === -1 && trimmedChunk) {
      matchIndex = seek(trimmedChunk, 0);
      matchValue = trimmedChunk;
    }
    if (matchIndex === -1) {
      positions.push(null);
      continue;
    }
    const before = normalizedContent.slice(0, matchIndex);
    const beforeLines = before.split("\n");
    const startLine = beforeLines.length;
    const startColumn = beforeLines[beforeLines.length - 1].length + 1;
    const lines = matchValue.split("\n");
    const endLine = startLine + lines.length - 1;
    const endColumn = lines.length === 1 ? startColumn + lines[0].length : lines[lines.length - 1].length + 1;
    positions.push({ startLine, startColumn, endLine, endColumn });
    searchOffset = matchIndex + matchValue.length;
  }
  return positions;
}
function buildChunksWithPositions(originalContent, chunks) {
  const positions = computePositions(originalContent, chunks);
  return chunks.map((content, index) => ({
    content,
    index,
    position: positions[index] ?? null
  }));
}
async function splitTypeScriptWithPositions(id, content) {
  const splitter = RecursiveCharacterTextSplitter.fromLanguage("js", {
    chunkSize: 512,
    chunkOverlap: 100
  });
  const docs = await splitter.createDocuments([content]);
  const chunks = docs.map((d) => d.pageContent);
  return buildChunksWithPositions(content, chunks);
}
async function splitTypeScript(id, content) {
  const chunks = await splitTypeScriptWithPositions(id, content);
  return chunks.map((chunk) => chunk.content);
}
export {
  ingest,
  splitTypeScript,
  splitTypeScriptWithPositions
};
//# sourceMappingURL=ingest.js.map
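ingest() ties the modules above together: for each source it computes a content id via cid(), splits with the configured splitter (MarkdownTextSplitter by default; the TypeScript-aware splitter is exported separately), embeds chunks in batches of 40, and streams { content, embedding } pairs to store.index(). A composition sketch with a hypothetical in-memory stand-in store that implements only the index() surface used above; the package's real SQLite-backed stores (see store.d.ts and sqlite.d.ts in the file list) are not shown in this excerpt:

import { ingest } from "@deepagents/retrieval/dist/lib/ingest.js";
import { repo } from "@deepagents/retrieval/dist/lib/connectors/repo.js";
import { fastembed } from "@deepagents/retrieval/dist/lib/embedders/fastembed.js";

// Hypothetical stand-in: logs chunks instead of persisting them.
const memoryStore = {
  async index(sourceId, doc) {
    for await (const chunk of doc.chunker()) {
      console.log(sourceId, doc.id, chunk.embedding.length);
    }
  }
};

await ingest(
  {
    connector: repo("/path/to/checkout", [".md"], undefined),
    store: memoryStore, // the full Store interface in store.d.ts has more members
    embedder: fastembed()
  },
  (id) => console.log("ingesting", id)
);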