agentic-knowledge-mcp 1.0.17 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentic-knowledge-mcp",
3
- "version": "1.0.17",
3
+ "version": "1.1.0",
4
4
  "description": "A Model Context Protocol server for agentic knowledge guidance with web-based documentation loading and intelligent search instructions",
5
5
  "type": "module",
6
6
  "main": "packages/cli/dist/index.js",
@@ -29,9 +29,9 @@
29
29
  "commander": "^12.0.0",
30
30
  "js-yaml": "4.1.0",
31
31
  "ora": "^8.0.1",
32
- "@codemcp/knowledge": "1.0.17",
33
- "@codemcp/knowledge-content-loader": "1.0.17",
34
- "@codemcp/knowledge-core": "1.0.17"
32
+ "@codemcp/knowledge": "1.1.0",
33
+ "@codemcp/knowledge-content-loader": "1.1.0",
34
+ "@codemcp/knowledge-core": "1.1.0"
35
35
  },
36
36
  "devDependencies": {
37
37
  "@eslint/js": "^9.34.0",
@@ -5,15 +5,16 @@ import { Command } from "commander";
5
5
  import chalk from "chalk";
6
6
  import { promises as fs } from "node:fs";
7
7
  import * as path from "node:path";
8
+ import { URL } from "node:url";
8
9
  import { ConfigManager } from "@codemcp/knowledge-core";
9
10
  export const createCommand = new Command("create")
10
11
  .description("Create a new docset using presets")
11
- .requiredOption("--preset <type>", "Preset type: git-repo or local-folder")
12
+ .requiredOption("--preset <type>", "Preset type: git-repo, local-folder, or archive")
12
13
  .requiredOption("--id <id>", "Unique docset ID")
13
14
  .requiredOption("--name <name>", "Human-readable docset name")
14
15
  .option("--description <desc>", "Docset description")
15
- .option("--url <url>", "Git repository URL (required for git-repo preset)")
16
- .option("--path <path>", "Local folder path (required for local-folder preset)")
16
+ .option("--url <url>", "Git repository URL (git-repo) or archive file URL (archive preset)")
17
+ .option("--path <path>", "Local folder path (local-folder) or local archive file path (archive preset)")
17
18
  .option("--branch <branch>", "Git branch (default: main)", "main")
18
19
  .action(async (options) => {
19
20
  try {
@@ -47,8 +48,11 @@ export const createCommand = new Command("create")
47
48
  else if (options.preset === "local-folder") {
48
49
  newDocset = await createLocalFolderDocset(options);
49
50
  }
51
+ else if (options.preset === "archive") {
52
+ newDocset = await createArchiveDocset(options);
53
+ }
50
54
  else {
51
- throw new Error(`Unknown preset: ${options.preset}. Use 'git-repo' or 'local-folder'`);
55
+ throw new Error(`Unknown preset: ${options.preset}. Use 'git-repo', 'local-folder', or 'archive'`);
52
56
  }
53
57
  // Add to config
54
58
  config.docsets.push(newDocset);
@@ -111,3 +115,54 @@ async function createLocalFolderDocset(options) {
111
115
  ],
112
116
  };
113
117
  }
118
/**
 * Build a docset definition for the "archive" preset.
 *
 * At least one of `options.path` (local archive file) or `options.url`
 * (remote archive) must be given. A local path must point to an existing
 * regular file ending in .zip, .tar.gz or .tgz; a URL only has to be parseable.
 *
 * @param options - Parsed CLI options (`id`, `name`, and optionally
 *   `description`, `path`, `url`, `paths` as a comma-separated list).
 * @returns The docset object with a single source of type "archive".
 * @throws Error when neither path nor url is given, when the path is missing,
 *   is not a file or has an unsupported extension, or when the URL is malformed.
 */
async function createArchiveDocset(options) {
    if (!options.path && !options.url) {
        throw new Error("Either --path or --url is required for archive preset");
    }
    // If path is provided, validate it exists and looks like a supported archive
    if (options.path) {
        const fullPath = path.resolve(options.path);
        let stat;
        // Only the stat call is guarded: the validation errors below must reach
        // the caller with their own message instead of being rewritten into
        // "does not exist".
        try {
            stat = await fs.stat(fullPath);
        }
        catch {
            throw new Error(`Path does not exist or is invalid: ${options.path}`);
        }
        if (!stat.isFile()) {
            throw new Error(`Path is not a file: ${options.path}`);
        }
        const lowerPath = options.path.toLowerCase();
        const supportedExtensions = [".zip", ".tar.gz", ".tgz"];
        if (!supportedExtensions.some((extension) => lowerPath.endsWith(extension))) {
            throw new Error(`File is not a supported archive format (zip, tar.gz): ${options.path}`);
        }
    }
    // If URL is provided, validate it's a valid URL
    if (options.url) {
        try {
            new URL(options.url);
        }
        catch {
            throw new Error(`Invalid URL format: ${options.url}`);
        }
    }
    // Optional keys are only present when the matching option was supplied.
    const source = {
        type: "archive",
        ...(options.path ? { path: options.path } : {}),
        ...(options.url ? { url: options.url } : {}),
        ...(options.paths
            ? {
                // Tolerate "docs, guides" style input and stray commas.
                paths: options.paths
                    .split(",")
                    .map((entry) => entry.trim())
                    .filter((entry) => entry.length > 0),
            }
            : {}),
    };
    return {
        id: options.id,
        name: options.name,
        description: options.description || `Archive: ${options.path || options.url}`,
        sources: [source],
    };
}
@@ -6,7 +6,7 @@ import chalk from "chalk";
6
6
  import { promises as fs } from "node:fs";
7
7
  import * as path from "node:path";
8
8
  import { ConfigManager, calculateLocalPath, ensureKnowledgeGitignoreSync, discoverDirectoryPatterns, safelyClearDirectory, getDirectoryInfo, } from "@codemcp/knowledge-core";
9
- import { GitRepoLoader, WebSourceType, } from "@codemcp/knowledge-content-loader";
9
+ import { GitRepoLoader, ArchiveLoader, WebSourceType, } from "@codemcp/knowledge-content-loader";
10
10
  export const initCommand = new Command("init")
11
11
  .description("Initialize sources for a docset from configuration")
12
12
  .argument("<docset-id>", "ID of the docset to initialize")
@@ -169,6 +169,44 @@ export const initCommand = new Command("init")
169
169
  };
170
170
  await fs.writeFile(path.join(localPath, `.agentic-source-${index}.json`), JSON.stringify(metadata, null, 2));
171
171
  }
172
+ else if (source.type === "archive") {
173
+ // Handle archive file initialization (zip, tar.gz, etc.)
174
+ const loader = new ArchiveLoader();
175
+ const sourceUrl = source.url || source.path || "";
176
+ console.log(chalk.gray(` Using ArchiveLoader for archive extraction`));
177
+ const webSourceConfig = {
178
+ url: sourceUrl,
179
+ type: WebSourceType.ARCHIVE,
180
+ options: {
181
+ paths: source.paths || [],
182
+ },
183
+ };
184
+ // Validate configuration
185
+ const validation = loader.validateConfig(webSourceConfig);
186
+ if (validation !== true) {
187
+ throw new Error(`Invalid archive source configuration: ${validation}`);
188
+ }
189
+ // Load content using ArchiveLoader
190
+ const result = await loader.load(webSourceConfig, localPath);
191
+ if (!result.success) {
192
+ throw new Error(`Archive loading failed: ${result.error}`);
193
+ }
194
+ // Collect discovered paths for config update
195
+ allDiscoveredPaths.push(...result.files);
196
+ totalFiles += result.files.length;
197
+ console.log(chalk.green(` ✅ Extracted ${result.files.length} files from archive`));
198
+ // Create source metadata
199
+ const metadata = {
200
+ source_url: sourceUrl,
201
+ source_type: source.type,
202
+ downloaded_at: new Date().toISOString(),
203
+ files_count: result.files.length,
204
+ files: result.files,
205
+ docset_id: docsetId,
206
+ content_hash: result.contentHash,
207
+ };
208
+ await fs.writeFile(path.join(localPath, `.agentic-source-${index}.json`), JSON.stringify(metadata, null, 2));
209
+ }
172
210
  else {
173
211
  console.log(chalk.red(` ❌ Source type '${source.type}' not yet supported`));
174
212
  }
@@ -8,6 +8,7 @@ import { promises as fs } from "node:fs";
8
8
  import * as path from "node:path";
9
9
  import { execSync } from "node:child_process";
10
10
  import { findConfigPathSync, loadConfigSync, calculateLocalPath, ensureKnowledgeGitignoreSync, } from "@codemcp/knowledge-core";
11
+ import { ArchiveLoader, WebSourceType, } from "@codemcp/knowledge-content-loader";
11
12
  export const refreshCommand = new Command("refresh")
12
13
  .description("Refresh sources for docsets")
13
14
  .argument("[docset-id]", "ID of specific docset to refresh (refresh all if not specified)")
@@ -95,6 +96,11 @@ async function refreshDocset(docset, configPath, force) {
95
96
  totalFiles += sourceFiles.files_count;
96
97
  refreshedSources.push(sourceFiles);
97
98
  }
99
+ else if (source.type === "archive") {
100
+ const sourceFiles = await refreshArchiveSource(source, localPath, index, docset.id, force);
101
+ totalFiles += sourceFiles.files_count;
102
+ refreshedSources.push(sourceFiles);
103
+ }
98
104
  else {
99
105
  console.log(chalk.yellow(` ⚠️ Source type '${source.type}' not yet supported, skipping`));
100
106
  }
@@ -238,6 +244,76 @@ async function refreshGitSource(webSource, localPath, index, docsetId, force) {
238
244
  await fs.rm(tempDir, { recursive: true, force: true });
239
245
  }
240
246
  }
247
/**
 * Refresh one archive source of a docset.
 *
 * Change detection compares the loader's content id (an identifier of the
 * archive itself) with the `content_id` stored by the previous refresh. The
 * stored `content_hash` is a hash of the extracted files and is kept for
 * information only; it is not comparable with the content id.
 *
 * NOTE(review): metadata written before this field existed has no
 * `content_id`, so the first refresh after an upgrade always reloads. Also,
 * ArchiveLoader.getContentId falls back to a URL-only hash when it cannot
 * obtain an ETag/Last-Modified header, so such remote sources will look
 * unchanged until `force` is used.
 *
 * @param source - Archive source entry from the docset configuration.
 * @param localPath - Directory holding the docset's extracted files.
 * @param index - Position of the source, used in the metadata file name.
 * @param docsetId - ID of the docset being refreshed.
 * @param force - When true, reload even if the archive looks unchanged.
 * @returns Source metadata (url, type, timestamp, file list and count).
 * @throws Error when the archive cannot be loaded.
 */
async function refreshArchiveSource(source, localPath, index, docsetId, force) {
    const sourceMetadataPath = path.join(localPath, `.agentic-source-${index}.json`);
    let existingSourceMetadata = null;
    try {
        const content = await fs.readFile(sourceMetadataPath, "utf8");
        existingSourceMetadata = JSON.parse(content);
    }
    catch {
        // No existing (or unreadable) metadata, will do full refresh
    }
    const sourceUrl = source.url || source.path || "";
    const loader = new ArchiveLoader();
    const webSourceConfig = {
        url: sourceUrl,
        type: WebSourceType.ARCHIVE,
        options: {
            paths: source.paths || [],
        },
    };
    // Taken BEFORE loading: if the archive changes while we download, the
    // stored id is the older one and the next refresh reloads instead of
    // wrongly skipping.
    let currentContentId = null;
    try {
        currentContentId = await loader.getContentId(webSourceConfig);
    }
    catch {
        // Could not determine the id; proceed with a full refresh
    }
    const isUnchanged = !force &&
        existingSourceMetadata !== null &&
        currentContentId !== null &&
        existingSourceMetadata.content_id === currentContentId;
    if (isUnchanged) {
        try {
            const updatedMetadata = {
                ...existingSourceMetadata,
                downloaded_at: new Date().toISOString(),
            };
            await fs.writeFile(sourceMetadataPath, JSON.stringify(updatedMetadata, null, 2));
            return updatedMetadata;
        }
        catch {
            // Could not update the metadata, proceed with full refresh
        }
    }
    // Load first and prune afterwards, so a failed download or extraction
    // leaves the previously extracted files untouched.
    const result = await loader.load(webSourceConfig, localPath);
    if (!result.success) {
        throw new Error(`Archive refresh failed: ${result.error}`);
    }
    // Remove files the previous refresh produced that the new archive no longer has
    const previousFiles = Array.isArray(existingSourceMetadata?.files)
        ? existingSourceMetadata.files
        : [];
    const currentFiles = new Set(result.files);
    const rootDir = path.resolve(localPath);
    for (const file of previousFiles) {
        if (typeof file !== "string" || currentFiles.has(file)) {
            continue;
        }
        const filePath = path.resolve(rootDir, file);
        const relative = path.relative(rootDir, filePath);
        const escapesRoot = relative === "" ||
            relative === ".." ||
            relative.startsWith(`..${path.sep}`) ||
            path.isAbsolute(relative);
        if (escapesRoot) {
            // Never delete anything outside the docset directory
            continue;
        }
        try {
            await fs.unlink(filePath);
        }
        catch {
            // File might already be deleted, ignore
        }
    }
    const metadata = {
        source_url: sourceUrl,
        source_type: "archive",
        downloaded_at: new Date().toISOString(),
        files_count: result.files.length,
        files: result.files,
        docset_id: docsetId,
    };
    // Store both hashes: content_hash describes the extracted files,
    // content_id identifies the archive and drives future change detection.
    const metadataWithHash = {
        ...metadata,
        content_hash: result.contentHash,
        ...(currentContentId !== null ? { content_id: currentContentId } : {}),
    };
    await fs.writeFile(sourceMetadataPath, JSON.stringify(metadataWithHash, null, 2));
    return metadata;
}
241
317
  // Reuse utility functions from init.ts
242
318
  async function findMarkdownFiles(dir) {
243
319
  const files = [];
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@codemcp/knowledge-cli",
3
- "version": "1.0.17",
3
+ "version": "1.1.0",
4
4
  "description": "Command-line interface for agentic knowledge web content management",
5
5
  "type": "module",
6
6
  "main": "dist/exports.js",
@@ -0,0 +1,87 @@
1
+ /**
2
+ * Archive file content loader (supports zip, tar.gz, etc.)
3
+ */
4
+ import { ContentLoader, type LoadResult } from "./loader.js";
5
+ import { WebSourceConfig } from "../types.js";
6
/**
 * Loader that pulls documentation content out of archive files
 * (zip, tar.gz, ...), taken either from a local path or a remote URL.
 */
export declare class ArchiveLoader extends ContentLoader {
    /** Tells whether the given web source is of the archive type. */
    canHandle(webSource: WebSourceConfig): boolean;
    /** Checks the source configuration; returns `true` or an error message. */
    validateConfig(webSource: WebSourceConfig): true | string;
    /** Extracts the archive's content into `targetPath`. */
    load(webSource: WebSourceConfig, targetPath: string): Promise<LoadResult>;
    /** Produces an identifier of the archive used for change detection. */
    getContentId(webSource: WebSourceConfig): Promise<string>;
    /** Issues a HEAD request and collects the response headers. */
    private getRemoteHeaders;
    /** Distinguishes http(s) URLs from local paths. */
    private isRemoteUrl;
    /** Derives the archive type from the file extension. */
    private detectArchiveType;
    /** Downloads a remote archive, or returns a local path unchanged. */
    private resolveArchiveFile;
    /** Fetches a remote archive into the temporary directory. */
    private downloadArchive;
    /** Unpacks a zip archive with adm-zip. */
    private extractZip;
    /** Unpacks a tar.gz archive. */
    private extractTarGz;
    /**
     * When the extraction produced exactly one directory and no files at the
     * top level, lifts that directory's entries up by one level.
     */
    private flattenSingleRoot;
    /** Copies extracted content into the target directory. */
    private extractContent;
    /** Copies only the documentation files found in the source directory. */
    private extractDocumentationFiles;
    /** Recursive directory copy. */
    private copyDirectory;
    /** Recursively lists the files below a directory. */
    private scanAllFiles;
    /** Hashes the extracted files for change detection. */
    private generateContentHash;
    /** Creates the temporary working directory. */
    private createTempDirectory;
    /** Removes the temporary working directory. */
    private cleanupTempDirectory;
}
@@ -0,0 +1,391 @@
1
+ /**
2
+ * Archive file content loader (supports zip, tar.gz, etc.)
3
+ */
4
+ import { promises as fs } from "node:fs";
5
+ import * as path from "node:path";
6
+ import * as crypto from "node:crypto";
7
+ import https from "node:https";
8
+ import http from "node:http";
9
+ import { URL } from "node:url";
10
+ import AdmZip from "adm-zip";
11
+ import * as tar from "tar";
12
+ import { ContentLoader } from "./loader.js";
13
+ import { WebSourceType, WebSourceError, WebSourceErrorType, } from "../types.js";
14
+ import { filterDocumentationFiles } from "./file-filter.js";
15
/**
 * Content loader for archive files (.zip, .tar.gz, .tgz), read from a local
 * path or downloaded over HTTP(S).
 */
export class ArchiveLoader extends ContentLoader {
    /**
     * Check if this loader can handle the given web source type
     */
    canHandle(webSource) {
        return webSource.type === WebSourceType.ARCHIVE;
    }
    /**
     * Validate the web source configuration
     * @returns `true` when valid, otherwise a human-readable error message
     */
    validateConfig(webSource) {
        if (!webSource.url) {
            return "Archive source must have a URL (remote) or local path";
        }
        return true;
    }
    /**
     * Load content from an archive file into `targetPath`.
     *
     * Never throws: failures are reported through `success: false` and `error`.
     * @returns Extracted file paths (relative to `targetPath`) and their content hash
     */
    async load(webSource, targetPath) {
        try {
            const options = webSource.options;
            const tempDir = await this.createTempDirectory();
            try {
                // Get the archive file (download if remote, or use local path)
                const archiveFilePath = await this.resolveArchiveFile(webSource.url, tempDir);
                // Detect archive type
                const archiveType = this.detectArchiveType(archiveFilePath);
                // Extract to temp directory
                const extractDir = path.join(tempDir, "extracted");
                await fs.mkdir(extractDir, { recursive: true });
                if (archiveType === "zip") {
                    this.extractZip(archiveFilePath, extractDir);
                }
                else if (archiveType === "tar.gz") {
                    await this.extractTarGz(archiveFilePath, extractDir);
                }
                else {
                    throw new WebSourceError(WebSourceErrorType.ARCHIVE_ERROR, `Unsupported archive format. Supported formats: .zip, .tar.gz`, { archiveType });
                }
                // Flatten single root directory
                await this.flattenSingleRoot(extractDir);
                // Extract specified paths or all documentation content
                const extractedFiles = await this.extractContent(extractDir, targetPath, options?.paths);
                // Generate content hash
                const contentHash = await this.generateContentHash(targetPath, extractedFiles);
                return {
                    success: true,
                    files: extractedFiles,
                    contentHash,
                };
            }
            finally {
                await this.cleanupTempDirectory(tempDir);
            }
        }
        catch (error) {
            const errorMessage = error instanceof Error ? error.message : String(error);
            return {
                success: false,
                files: [],
                contentHash: "",
                error: `Archive loading failed: ${errorMessage}`,
            };
        }
    }
    /**
     * Get content identifier for change detection.
     *
     * Remote: hash of the URL plus ETag / Last-Modified (or the URL alone when
     * the server sends neither). Local: hash of the archive bytes. Any failure
     * falls back to a hash of the URL.
     */
    async getContentId(webSource) {
        try {
            if (this.isRemoteUrl(webSource.url)) {
                // For remote URLs, try HEAD request for ETag/Last-Modified
                const headers = await this.getRemoteHeaders(webSource.url);
                const etag = headers["etag"] || "";
                const lastModified = headers["last-modified"] || "";
                const identifier = etag || lastModified || webSource.url;
                return crypto
                    .createHash("sha256")
                    .update(`${webSource.url}:${identifier}`)
                    .digest("hex");
            }
            else {
                // For local files, hash the file content
                const content = await fs.readFile(webSource.url);
                return crypto.createHash("sha256").update(content).digest("hex");
            }
        }
        catch {
            // Fallback to URL-based hash
            return crypto.createHash("sha256").update(webSource.url).digest("hex");
        }
    }
    /**
     * Get headers from remote URL using HEAD request.
     *
     * Redirects are followed (up to `redirectsLeft` hops) so the headers
     * describe the final resource rather than the redirect response.
     */
    getRemoteHeaders(url, redirectsLeft = 5) {
        return new Promise((resolve, reject) => {
            const protocol = url.startsWith("https") ? https : http;
            const request = protocol.request(url, { method: "HEAD" }, (response) => {
                // Discard any body so the socket is released
                response.resume();
                const status = response.statusCode ?? 0;
                if (status >= 300 && status < 400 && redirectsLeft > 0) {
                    const nextUrl = this.resolveRedirectTarget(response.headers.location, url);
                    if (nextUrl) {
                        this.getRemoteHeaders(nextUrl, redirectsLeft - 1).then(resolve, reject);
                        return;
                    }
                }
                const headers = {};
                for (const [key, value] of Object.entries(response.headers ?? {})) {
                    if (typeof value === "string") {
                        headers[key] = value;
                    }
                    else if (Array.isArray(value) && value.length > 0 && value[0]) {
                        headers[key] = value[0];
                    }
                }
                resolve(headers);
            });
            request.on("error", reject);
            request.end();
        });
    }
    /**
     * Resolve a redirect `Location` header against the URL that produced it.
     * @returns The absolute http(s) URL, or null when missing, malformed or
     *   pointing to another protocol
     */
    resolveRedirectTarget(location, baseUrl) {
        if (!location) {
            return null;
        }
        try {
            const next = new URL(location, baseUrl);
            return next.protocol === "http:" || next.protocol === "https:"
                ? next.toString()
                : null;
        }
        catch {
            return null;
        }
    }
    /**
     * Determine if the source is a remote URL or local path
     */
    isRemoteUrl(url) {
        return url.startsWith("http://") || url.startsWith("https://");
    }
    /**
     * Detect archive type based on file extension
     * @returns "tar.gz", "zip" or "unknown"
     */
    detectArchiveType(filePath) {
        const lowerPath = filePath.toLowerCase();
        if (lowerPath.endsWith(".tar.gz") || lowerPath.endsWith(".tgz")) {
            return "tar.gz";
        }
        if (lowerPath.endsWith(".zip")) {
            return "zip";
        }
        return "unknown";
    }
    /**
     * Resolve the archive file path - download if remote, return as-is if local
     * @throws WebSourceError when a local archive file is not accessible
     */
    async resolveArchiveFile(url, tempDir) {
        if (this.isRemoteUrl(url)) {
            return this.downloadArchive(url, tempDir);
        }
        // Local file - verify it exists
        try {
            await fs.access(url);
            return url;
        }
        catch {
            throw new WebSourceError(WebSourceErrorType.ARCHIVE_ERROR, `Local archive file not found: ${url}`, { url });
        }
    }
    /**
     * Download an archive file from a remote URL into `tempDir`.
     *
     * The local file name is taken from the ORIGINAL URL, not from a redirect
     * target, because the archive type is later detected from its extension.
     * @returns Path of the downloaded file
     */
    async downloadArchive(url, tempDir) {
        // Determine filename from URL
        const urlPath = new URL(url).pathname;
        const filename = path.basename(urlPath) || "download.archive";
        const archivePath = path.join(tempDir, filename);
        const buffer = await this.fetchRemoteBuffer(url, 5);
        await fs.writeFile(archivePath, buffer);
        return archivePath;
    }
    /**
     * GET a URL and collect the response body, following up to `redirectsLeft`
     * redirects. Rejects on HTTP status >= 400, on a redirect without a usable
     * Location header, on too many redirects, and on network errors.
     */
    fetchRemoteBuffer(url, redirectsLeft) {
        return new Promise((resolve, reject) => {
            const protocol = url.startsWith("https") ? https : http;
            const request = protocol.get(url, (response) => {
                const status = response.statusCode;
                if (status === undefined || status >= 400) {
                    response.resume();
                    reject(new Error(`HTTP ${response.statusCode}: ${response.statusMessage}`));
                    return;
                }
                if (status >= 300) {
                    // A 3xx body is not the archive; saving it would only
                    // produce a confusing extraction error later.
                    response.resume();
                    const nextUrl = this.resolveRedirectTarget(response.headers.location, url);
                    if (!nextUrl) {
                        reject(new Error(`HTTP ${status}: redirect without a valid Location header`));
                        return;
                    }
                    if (redirectsLeft <= 0) {
                        reject(new Error(`Too many redirects while downloading ${url}`));
                        return;
                    }
                    this.fetchRemoteBuffer(nextUrl, redirectsLeft - 1).then(resolve, reject);
                    return;
                }
                const chunks = [];
                response.on("data", (chunk) => {
                    chunks.push(chunk);
                });
                response.on("error", reject);
                response.on("end", () => {
                    resolve(Buffer.concat(chunks));
                });
            });
            request.on("error", (error) => {
                reject(new WebSourceError(WebSourceErrorType.ARCHIVE_ERROR, `Failed to download archive from ${url}: ${error instanceof Error ? error.message : String(error)}`, { url }));
            });
        });
    }
    /**
     * Extract a zip file to a directory using adm-zip
     * @throws WebSourceError when extraction fails
     */
    extractZip(zipPath, targetDir) {
        try {
            const zip = new AdmZip(zipPath);
            zip.extractAllTo(targetDir, true);
        }
        catch (error) {
            throw new WebSourceError(WebSourceErrorType.ARCHIVE_ERROR, `Failed to extract zip: ${error instanceof Error ? error.message : String(error)}`, { zipPath });
        }
    }
    /**
     * Extract a tar.gz file to a directory
     * @throws WebSourceError when extraction fails
     */
    async extractTarGz(tarGzPath, targetDir) {
        try {
            await tar.extract({
                file: tarGzPath,
                cwd: targetDir,
                strip: 0,
            });
        }
        catch (error) {
            throw new WebSourceError(WebSourceErrorType.ARCHIVE_ERROR, `Failed to extract tar.gz: ${error instanceof Error ? error.message : String(error)}`, { tarGzPath });
        }
    }
    /**
     * If the extracted contents have a single root directory and no files at root,
     * move that directory's contents one level up.
     */
    async flattenSingleRoot(extractDir) {
        const entries = await fs.readdir(extractDir, { withFileTypes: true });
        const directories = entries.filter((e) => e.isDirectory());
        const files = entries.filter((e) => e.isFile());
        if (directories.length === 1 && files.length === 0) {
            const singleDir = path.join(extractDir, directories[0].name);
            // Move the root aside first: if it contains an entry with its own
            // name (e.g. "pkg/pkg"), moving that entry up would otherwise
            // collide with the directory still being emptied.
            const stagingDir = path.join(extractDir, `.flatten-${Date.now()}-${Math.random().toString(36).slice(2)}`);
            await fs.rename(singleDir, stagingDir);
            const innerEntries = await fs.readdir(stagingDir);
            // Move all contents up one level
            for (const entry of innerEntries) {
                await fs.rename(path.join(stagingDir, entry), path.join(extractDir, entry));
            }
            // Remove the now-empty directory
            await fs.rmdir(stagingDir);
        }
    }
    /**
     * Extract content from extracted archive to target directory.
     *
     * With `paths`, only those files/directories are copied; otherwise only
     * documentation files are copied.
     * @returns Copied file paths, relative to `targetDir`
     */
    async extractContent(sourceDir, targetDir, paths) {
        await fs.mkdir(targetDir, { recursive: true });
        const extractedFiles = [];
        if (paths && paths.length > 0) {
            // Extract only specified paths
            for (const relPath of paths) {
                const sourcePath = path.join(sourceDir, relPath);
                const targetPath = path.join(targetDir, relPath);
                // A path such as "../x" would read from and write to places
                // outside the extraction and target directories.
                if (!this.isInsideDirectory(sourceDir, sourcePath)) {
                    console.warn(`Warning: Could not extract ${relPath}: path is outside the archive`);
                    continue;
                }
                try {
                    const stats = await fs.stat(sourcePath);
                    if (stats.isDirectory()) {
                        await this.copyDirectory(sourcePath, targetPath, extractedFiles, targetDir);
                    }
                    else if (stats.isFile()) {
                        await fs.mkdir(path.dirname(targetPath), { recursive: true });
                        await fs.copyFile(sourcePath, targetPath);
                        extractedFiles.push(path.relative(targetDir, targetPath));
                    }
                }
                catch (error) {
                    console.warn(`Warning: Could not extract ${relPath}: ${error instanceof Error ? error.message : String(error)}`);
                }
            }
        }
        else {
            // Use smart filtering to extract only documentation files
            await this.extractDocumentationFiles(sourceDir, targetDir, extractedFiles);
        }
        return extractedFiles;
    }
    /**
     * Check that `candidate` is `parent` itself or lies below it
     */
    isInsideDirectory(parent, candidate) {
        const relative = path.relative(path.resolve(parent), path.resolve(candidate));
        return (relative !== ".." &&
            !relative.startsWith(`..${path.sep}`) &&
            !path.isAbsolute(relative));
    }
    /**
     * Extract only documentation files from source directory
     */
    async extractDocumentationFiles(sourceDir, targetDir, extractedFiles) {
        const allFiles = await this.scanAllFiles(sourceDir);
        // Filter on archive-relative paths: the filter inspects every directory
        // segment, and the absolute temp path contains segments that belong to
        // the machine (e.g. a working directory named "build"), not the archive.
        const relativeFiles = allFiles.map((filePath) => path.relative(sourceDir, filePath));
        const docFiles = filterDocumentationFiles(relativeFiles);
        for (const relativePath of docFiles) {
            const filePath = path.join(sourceDir, relativePath);
            const targetPath = path.join(targetDir, relativePath);
            try {
                await fs.mkdir(path.dirname(targetPath), { recursive: true });
                await fs.copyFile(filePath, targetPath);
                extractedFiles.push(relativePath);
            }
            catch (error) {
                console.warn(`Warning: Could not copy ${relativePath}: ${error instanceof Error ? error.message : String(error)}`);
            }
        }
    }
    /**
     * Copy directory recursively.
     *
     * Symbolic links and other special entries are skipped, so a link inside
     * an archive cannot pull in files from outside it and a dangling link
     * cannot abort the copy.
     * @param fileList - Receives each copied file's path relative to `baseDir`
     * @param baseDir - Directory the recorded paths are relative to
     *   (defaults to `target`)
     */
    async copyDirectory(source, target, fileList, baseDir = target) {
        await fs.mkdir(target, { recursive: true });
        const items = await fs.readdir(source, { withFileTypes: true });
        for (const item of items) {
            const sourcePath = path.join(source, item.name);
            const targetPath = path.join(target, item.name);
            if (item.isDirectory()) {
                await this.copyDirectory(sourcePath, targetPath, fileList, baseDir);
            }
            else if (item.isFile()) {
                await fs.copyFile(sourcePath, targetPath);
                // Relative to the overall base, not to the immediate parent,
                // so nested files keep their directory prefix.
                fileList.push(path.relative(baseDir, targetPath));
            }
        }
    }
    /**
     * Recursively scan all files in a directory.
     *
     * `.git` entries are ignored; symbolic links and other special entries
     * are skipped.
     * @returns Full paths of all regular files found
     */
    async scanAllFiles(dir) {
        const files = [];
        async function scan(currentDir) {
            const items = await fs.readdir(currentDir, { withFileTypes: true });
            for (const item of items) {
                if (item.name === ".git")
                    continue;
                const fullPath = path.join(currentDir, item.name);
                if (item.isDirectory()) {
                    await scan(fullPath);
                }
                else if (item.isFile()) {
                    files.push(fullPath);
                }
            }
        }
        await scan(dir);
        return files;
    }
    /**
     * Generate content hash for change detection.
     *
     * Hashes each file's relative path and bytes in sorted path order; files
     * that cannot be read are reported and left out.
     */
    async generateContentHash(targetDir, files) {
        const hash = crypto.createHash("sha256");
        const sortedFiles = files.slice().sort();
        for (const file of sortedFiles) {
            const filePath = path.join(targetDir, file);
            try {
                const content = await fs.readFile(filePath);
                hash.update(file);
                hash.update(content);
            }
            catch (error) {
                console.warn(`Warning: Could not hash ${file}: ${error instanceof Error ? error.message : String(error)}`);
            }
        }
        return hash.digest("hex");
    }
    /**
     * Create a temporary directory below `<cwd>/.tmp`
     */
    async createTempDirectory() {
        const tempDir = path.join(process.cwd(), ".tmp", `archive-extract-${Date.now()}-${Math.random().toString(36).slice(2)}`);
        await fs.mkdir(tempDir, { recursive: true });
        return tempDir;
    }
    /**
     * Clean up temporary directory (best effort: failures are only logged)
     */
    async cleanupTempDirectory(tempDir) {
        try {
            await fs.rm(tempDir, { recursive: true, force: true });
        }
        catch (error) {
            console.warn(`Warning: Could not clean up temp directory ${tempDir}: ${error instanceof Error ? error.message : String(error)}`);
        }
    }
}
@@ -0,0 +1,15 @@
1
+ /**
2
+ * Shared file filtering utilities for documentation content extraction (REQ-18)
3
+ */
4
/**
 * Decides whether a single file counts as documentation content (REQ-18).
 * @param filePath - Path of the file to classify
 * @returns `true` when the file should be kept as documentation
 */
export declare function isDocumentationFile(filePath: string): boolean;
10
/**
 * Keeps only the documentation-relevant entries of a file list (REQ-18).
 * @param files - File paths to examine
 * @returns The subset of `files` classified as documentation
 */
export declare function filterDocumentationFiles(files: string[]): string[];
@@ -0,0 +1,80 @@
1
+ /**
2
+ * Shared file filtering utilities for documentation content extraction (REQ-18)
3
+ */
4
+ import * as path from "node:path";
5
// Project metadata files are never documentation (REQ-18); matched as a
// case-insensitive prefix of the file name.
const METADATA_FILE_PATTERN = /^(CHANGELOG|LICENSE|CONTRIBUTING|AUTHORS|CODE_OF_CONDUCT)/i;
// README files are documentation wherever they live (REQ-18).
const README_FILE_PATTERN = /^README/i;
// Directories whose content demonstrates usage (Issue #12).
const EXAMPLES_DIRECTORY_PATTERN = /\b(examples?|samples?)\b/i;
// Build, dependency and development directories, matched against whole
// path segments rather than substrings (REQ-18).
const EXCLUDED_DIRECTORIES = new Set([
    "node_modules",
    "vendor",
    ".git",
    "build",
    "dist",
    "target",
    ".cache",
    "__tests__",
    "test",
    "tests",
    ".github",
    ".vscode",
    ".idea",
]);
// Extensions treated as documentation in any directory (REQ-18).
const DOCUMENTATION_EXTENSIONS = new Set([".md", ".mdx", ".rst", ".txt", ".adoc", ".asciidoc"]);
// Binary extensions rejected even inside examples/samples directories.
const BINARY_EXTENSIONS = new Set([
    ".exe",
    ".bin",
    ".so",
    ".dll",
    ".dylib",
    ".a",
    ".o",
    ".obj",
]);
/**
 * Classify one file as documentation or not (REQ-18).
 *
 * Rules, in order: metadata files are rejected; anything below an excluded
 * directory is rejected; README files and documentation extensions are
 * accepted; inside examples/samples directories every non-binary file is
 * accepted; everything else is rejected.
 * @param filePath - Path to the file to check
 * @returns True if file should be included as documentation
 */
export function isDocumentationFile(filePath) {
    const filename = path.basename(filePath);
    if (METADATA_FILE_PATTERN.test(filename)) {
        return false;
    }
    const directory = path.dirname(filePath);
    // Normalize separators so segment matching works on every platform
    const segments = directory.split(path.sep).join("/").split("/");
    if (segments.some((segment) => EXCLUDED_DIRECTORIES.has(segment))) {
        return false;
    }
    if (README_FILE_PATTERN.test(filename)) {
        return true;
    }
    const extension = path.extname(filePath).toLowerCase();
    if (DOCUMENTATION_EXTENSIONS.has(extension)) {
        return true;
    }
    // Examples/samples keep all file types except binaries
    return EXAMPLES_DIRECTORY_PATTERN.test(directory) && !BINARY_EXTENSIONS.has(extension);
}
/**
 * Reduce a list of files to the documentation-relevant ones (REQ-18).
 * @param files - Array of file paths to filter
 * @returns Array of file paths that are considered documentation
 */
export function filterDocumentationFiles(files) {
    return files.filter((candidate) => isDocumentationFile(candidate));
}
@@ -51,18 +51,6 @@ export declare class GitRepoLoader extends ContentLoader {
51
51
  * Clean up temporary directory
52
52
  */
53
53
  private cleanupTempDirectory;
54
- /**
55
- * Filter list of files to only include documentation-relevant files (REQ-18)
56
- * @param files - Array of file paths to filter
57
- * @returns Array of file paths that are considered documentation
58
- */
59
- private filterDocumentationFiles;
60
- /**
61
- * Determine if a file is considered documentation content (REQ-18)
62
- * @param filePath - Path to the file to check
63
- * @returns True if file should be included as documentation
64
- */
65
- private isDocumentationFile;
66
54
  /**
67
55
  * Extract only documentation files from source directory (REQ-18)
68
56
  * @param sourceDir - Source directory to scan
@@ -7,6 +7,7 @@ import { execSync } from "node:child_process";
7
7
  import { ContentLoader } from "./loader.js";
8
8
  import { WebSourceType, WebSourceError, WebSourceErrorType, } from "../types.js";
9
9
  import * as crypto from "node:crypto";
10
+ import { filterDocumentationFiles } from "./file-filter.js";
10
11
  /**
11
12
  * Content loader for Git repositories (GitHub, GitLab, any Git repo)
12
13
  */
@@ -244,82 +245,6 @@ export class GitRepoLoader extends ContentLoader {
244
245
  console.warn(`Warning: Could not clean up temp directory ${tempDir}: ${error instanceof Error ? error.message : String(error)}`);
245
246
  }
246
247
  }
247
- /**
248
- * Filter list of files to only include documentation-relevant files (REQ-18)
249
- * @param files - Array of file paths to filter
250
- * @returns Array of file paths that are considered documentation
251
- */
252
- filterDocumentationFiles(files) {
253
- return files.filter((file) => this.isDocumentationFile(file));
254
- }
255
- /**
256
- * Determine if a file is considered documentation content (REQ-18)
257
- * @param filePath - Path to the file to check
258
- * @returns True if file should be included as documentation
259
- */
260
- isDocumentationFile(filePath) {
261
- const filename = path.basename(filePath);
262
- const extension = path.extname(filePath).toLowerCase();
263
- const directory = path.dirname(filePath);
264
- // Exclude project metadata files (REQ-18)
265
- const metadataFiles = /^(CHANGELOG|LICENSE|CONTRIBUTING|AUTHORS|CODE_OF_CONDUCT)/i;
266
- if (metadataFiles.test(filename)) {
267
- return false;
268
- }
269
- // Normalize directory path for consistent matching (use forward slashes)
270
- const normalizedDir = directory.split(path.sep).join("/");
271
- const pathParts = normalizedDir.split("/");
272
- // Exclude build, dependency, and development directories (REQ-18)
273
- // Use exact directory name matching, not substring matching
274
- const excludedDirs = [
275
- "node_modules",
276
- "vendor",
277
- ".git",
278
- "build",
279
- "dist",
280
- "target",
281
- ".cache",
282
- "__tests__",
283
- "test",
284
- "tests",
285
- ".github",
286
- ".vscode",
287
- ".idea",
288
- ];
289
- // Check if any path segment matches excluded directories
290
- for (const excludedDir of excludedDirs) {
291
- if (pathParts.includes(excludedDir)) {
292
- return false;
293
- }
294
- }
295
- // Include README files anywhere (REQ-18)
296
- if (/^README/i.test(filename)) {
297
- return true;
298
- }
299
- // Include documentation file extensions anywhere, regardless of directory (REQ-18)
300
- const docExtensions = [".md", ".mdx", ".rst", ".txt", ".adoc", ".asciidoc"];
301
- if (docExtensions.includes(extension)) {
302
- return true;
303
- }
304
- // Special case: examples/samples directory - include ALL file types (Issue #12)
305
- // These directories contain code that demonstrates usage patterns
306
- const isInExamples = /\b(examples?|samples?)\b/i.test(directory);
307
- if (isInExamples) {
308
- // In examples/samples, exclude only binary files
309
- const excludedInExamples = [
310
- ".exe",
311
- ".bin",
312
- ".so",
313
- ".dll",
314
- ".dylib",
315
- ".a",
316
- ".o",
317
- ".obj",
318
- ];
319
- return !excludedInExamples.includes(extension);
320
- }
321
- return false;
322
- }
323
248
  /**
324
249
  * Extract only documentation files from source directory (REQ-18)
325
250
  * @param sourceDir - Source directory to scan
@@ -330,7 +255,7 @@ export class GitRepoLoader extends ContentLoader {
330
255
  // First, scan all files in the repository
331
256
  const allFiles = await this.scanAllFiles(sourceDir);
332
257
  // Filter to only documentation files
333
- const docFiles = this.filterDocumentationFiles(allFiles);
258
+ const docFiles = filterDocumentationFiles(allFiles);
334
259
  // Copy the filtered files
335
260
  for (const filePath of docFiles) {
336
261
  const relativePath = path.relative(sourceDir, filePath);
@@ -3,7 +3,9 @@
3
3
  */
4
4
  export { ContentLoader } from "./loader.js";
5
5
  export { GitRepoLoader } from "./git-repo-loader.js";
6
+ export { ArchiveLoader } from "./archive-loader.js";
6
7
  export { DocumentationSiteLoader } from "./documentation-site-loader.js";
7
8
  export { ApiDocumentationLoader } from "./api-documentation-loader.js";
8
9
  export { ContentProcessor } from "./content-processor.js";
9
10
  export { MetadataManager } from "./metadata-manager.js";
11
+ export { isDocumentationFile, filterDocumentationFiles, } from "./file-filter.js";
@@ -3,7 +3,9 @@
3
3
  */
4
4
  export { ContentLoader } from "./loader.js";
5
5
  export { GitRepoLoader } from "./git-repo-loader.js";
6
+ export { ArchiveLoader } from "./archive-loader.js";
6
7
  export { DocumentationSiteLoader } from "./documentation-site-loader.js";
7
8
  export { ApiDocumentationLoader } from "./api-documentation-loader.js";
8
9
  export { ContentProcessor } from "./content-processor.js";
9
10
  export { MetadataManager } from "./metadata-manager.js";
11
+ export { isDocumentationFile, filterDocumentationFiles, } from "./file-filter.js";
@@ -22,7 +22,8 @@ export interface DocsetConfig {
22
22
  export declare enum WebSourceType {
23
23
  GIT_REPO = "git_repo",
24
24
  DOCUMENTATION_SITE = "documentation_site",
25
- API_DOCUMENTATION = "api_documentation"
25
+ API_DOCUMENTATION = "api_documentation",
26
+ ARCHIVE = "archive"
26
27
  }
27
28
  /**
28
29
  * Configuration for Git repository web sources
@@ -55,16 +56,23 @@ export interface ApiDocumentationOptions {
55
56
  /** Packages or modules to include */
56
57
  include_packages?: string[];
57
58
  }
59
+ /**
60
+ * Configuration for archive file web sources (zip, tar.gz, etc.)
61
+ */
62
+ export interface ArchiveOptions {
63
+ /** Specific paths to extract from the archive */
64
+ paths?: string[];
65
+ }
58
66
  /**
59
67
  * Configuration for a single web source
60
68
  */
61
69
  export interface WebSourceConfig {
62
- /** URL of the web source */
70
+ /** URL of the web source (or local path for archive sources) */
63
71
  url: string;
64
72
  /** Type of web source */
65
73
  type: WebSourceType;
66
74
  /** Type-specific options */
67
- options?: GitRepoOptions | DocumentationSiteOptions | ApiDocumentationOptions;
75
+ options?: GitRepoOptions | DocumentationSiteOptions | ApiDocumentationOptions | ArchiveOptions;
68
76
  }
69
77
  /**
70
78
  * Metadata for a single web source download
@@ -108,6 +116,7 @@ export declare const METADATA_FILENAME = ".agentic-metadata.json";
108
116
  export declare enum WebSourceErrorType {
109
117
  WEB_SOURCE_ERROR = "WEB_SOURCE_ERROR",
110
118
  GIT_REPO_ERROR = "GIT_REPO_ERROR",
119
+ ARCHIVE_ERROR = "ARCHIVE_ERROR",
111
120
  NOT_IMPLEMENTED = "NOT_IMPLEMENTED"
112
121
  }
113
122
  /**
@@ -9,6 +9,7 @@ export var WebSourceType;
9
9
  WebSourceType["GIT_REPO"] = "git_repo";
10
10
  WebSourceType["DOCUMENTATION_SITE"] = "documentation_site";
11
11
  WebSourceType["API_DOCUMENTATION"] = "api_documentation";
12
+ WebSourceType["ARCHIVE"] = "archive";
12
13
  })(WebSourceType || (WebSourceType = {}));
13
14
  /**
14
15
  * Metadata file name pattern
@@ -21,6 +22,7 @@ export var WebSourceErrorType;
21
22
  (function (WebSourceErrorType) {
22
23
  WebSourceErrorType["WEB_SOURCE_ERROR"] = "WEB_SOURCE_ERROR";
23
24
  WebSourceErrorType["GIT_REPO_ERROR"] = "GIT_REPO_ERROR";
25
+ WebSourceErrorType["ARCHIVE_ERROR"] = "ARCHIVE_ERROR";
24
26
  WebSourceErrorType["NOT_IMPLEMENTED"] = "NOT_IMPLEMENTED";
25
27
  })(WebSourceErrorType || (WebSourceErrorType = {}));
26
28
  /**
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@codemcp/knowledge-content-loader",
3
- "version": "1.0.17",
3
+ "version": "1.1.0",
4
4
  "description": "Web content loading and metadata management for agentic knowledge system",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -29,11 +29,15 @@
29
29
  "typecheck": "tsc --noEmit"
30
30
  },
31
31
  "dependencies": {
32
- "simple-git": "^3.22.0"
32
+ "adm-zip": "0.5.16",
33
+ "simple-git": "^3.22.0",
34
+ "tar": "7.5.9"
33
35
  },
34
36
  "devDependencies": {
35
37
  "@eslint/js": "^9.34.0",
38
+ "@types/adm-zip": "0.5.7",
36
39
  "@types/node": "^24.3.0",
40
+ "@types/tar": "7.0.87",
37
41
  "eslint": "^9.34.0",
38
42
  "rimraf": "^6.0.1",
39
43
  "typescript": "^5.9.2",
@@ -207,6 +207,30 @@ function validateSource(source) {
207
207
  }
208
208
  return true;
209
209
  }
210
+ if (type === "archive") {
211
+ const hasPath = obj["path"] !== undefined &&
212
+ typeof obj["path"] === "string" &&
213
+ obj["path"].trim() !== "";
214
+ const hasUrl = obj["url"] !== undefined &&
215
+ typeof obj["url"] === "string" &&
216
+ obj["url"].trim() !== "";
217
+ // Must have exactly one of path or url
218
+ if (hasPath === hasUrl) {
219
+ return false;
220
+ }
221
+ // Optional paths field
222
+ if (obj["paths"] !== undefined) {
223
+ if (!Array.isArray(obj["paths"])) {
224
+ return false;
225
+ }
226
+ for (const path of obj["paths"]) {
227
+ if (typeof path !== "string" || path.trim() === "") {
228
+ return false;
229
+ }
230
+ }
231
+ }
232
+ return true;
233
+ }
210
234
  // Unknown source type
211
235
  return false;
212
236
  }
@@ -46,6 +46,10 @@ export function calculateLocalPath(docset, configPath) {
46
46
  // For git repos, use standardized path: .knowledge/docsets/{id}
47
47
  return join(configDir, "docsets", docset.id);
48
48
  }
49
+ if (primarySource.type === "archive") {
50
+ // For archive sources, use standardized path: .knowledge/docsets/{id}
51
+ return join(configDir, "docsets", docset.id);
52
+ }
49
53
  throw new Error(`Unsupported source type: ${primarySource.type}`);
50
54
  }
51
55
  catch (error) {
@@ -87,6 +91,10 @@ export async function calculateLocalPathWithSymlinks(docset, configPath) {
87
91
  // For git repos, use standardized path: .knowledge/docsets/{id}
88
92
  return join(configDir, "docsets", docset.id);
89
93
  }
94
+ if (primarySource.type === "archive") {
95
+ // For archive sources, use standardized path: .knowledge/docsets/{id}
96
+ return join(configDir, "docsets", docset.id);
97
+ }
90
98
  throw new Error(`Unsupported source type: ${primarySource.type}`);
91
99
  }
92
100
  /**
@@ -30,10 +30,22 @@ export interface GitRepoSourceConfig extends BaseSourceConfig {
30
30
  /** Specific paths to extract (optional) */
31
31
  paths?: string[];
32
32
  }
33
+ /**
34
+ * Archive file source configuration (supports zip, tar.gz, etc.)
35
+ */
36
+ export interface ArchiveSourceConfig extends BaseSourceConfig {
37
+ type: "archive";
38
+ /** Local path to archive file (mutually exclusive with url) */
39
+ path?: string;
40
+ /** Remote URL to download archive from (mutually exclusive with path) */
41
+ url?: string;
42
+ /** Specific paths to extract (optional) */
43
+ paths?: string[];
44
+ }
33
45
  /**
34
46
  * Union type for all source configurations
35
47
  */
36
- export type SourceConfig = LocalFolderSourceConfig | GitRepoSourceConfig;
48
+ export type SourceConfig = LocalFolderSourceConfig | GitRepoSourceConfig | ArchiveSourceConfig;
37
49
  /**
38
50
  * Configuration for a single docset
39
51
  */
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@codemcp/knowledge-core",
3
- "version": "1.0.17",
3
+ "version": "1.1.0",
4
4
  "description": "Core functionality for agentic knowledge guidance system",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@codemcp/knowledge",
3
- "version": "1.0.17",
3
+ "version": "1.1.0",
4
4
  "description": "MCP server implementation for agentic knowledge guidance system",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",