docshark 0.1.8 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +28 -0
- package/README.md +1 -0
- package/dist/jobs/worker.js +1 -1
- package/dist/processor/extractor.d.ts +1 -1
- package/dist/processor/extractor.js +17 -1
- package/dist/scraper/fetcher.js +2 -1
- package/dist/types.d.ts +1 -0
- package/dist/version.d.ts +1 -1
- package/dist/version.js +1 -1
- package/package.json +11 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,33 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.1.12](https://github.com/Michael-Obele/docshark/compare/v0.1.11...v0.1.12) (2026-03-12)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
### 🐛 Bug Fixes
|
|
7
|
+
|
|
8
|
+
* **release:** update Node.js version to 24 and set registry URL ([2887a29](https://github.com/Michael-Obele/docshark/commit/2887a2920ab61adf6eb4bf4aa6b6d86143ffd34f))
|
|
9
|
+
|
|
10
|
+
## [0.1.11](https://github.com/Michael-Obele/docshark/compare/v0.1.10...v0.1.11) (2026-03-11)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
### ✨ Features
|
|
14
|
+
|
|
15
|
+
* Directly process raw Markdown and plain text files by checking content type and URL extension. ([a4d66b8](https://github.com/Michael-Obele/docshark/commit/a4d66b8cdd6342645b3cab292ef82c133c9ae52c))
|
|
16
|
+
|
|
17
|
+
## [0.1.10](https://github.com/Michael-Obele/docshark/compare/v0.1.9...v0.1.10) (2026-03-11)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
### 📚 Documentation
|
|
21
|
+
|
|
22
|
+
* Add NPM version badge to README. ([ae9c6ca](https://github.com/Michael-Obele/docshark/commit/ae9c6ca71dbd844de6b065a0e47021c7d1e3ec72))
|
|
23
|
+
|
|
24
|
+
## [0.1.9](https://github.com/Michael-Obele/docshark/compare/v0.1.8...v0.1.9) (2026-03-11)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
### ✨ Features
|
|
28
|
+
|
|
29
|
+
* Add NPM provenance publishing to the release workflow and set the release-please changelog type to default. ([7c6ed54](https://github.com/Michael-Obele/docshark/commit/7c6ed54ca8e51a3b73abf64cc775352937c9c1cf))
|
|
30
|
+
|
|
3
31
|
## 0.1.8 (2026-03-11)
|
|
4
32
|
|
|
5
33
|
**Full Changelog**: https://github.com/Michael-Obele/docshark/compare/v0.1.7...v0.1.8
|
package/README.md
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# 🦈 DocShark
|
|
2
2
|
|
|
3
3
|
[Bun](https://bun.sh/)
|
|
4
|
+
[NPM version](https://www.npmjs.com/package/docshark)
|
|
4
5
|
[Model Context Protocol](https://modelcontextprotocol.io/)
|
|
5
6
|
[GitHub releases](https://github.com/Michael-Obele/docshark/releases)
|
|
6
7
|
[License: MIT](https://opensource.org/licenses/MIT)
|
package/dist/jobs/worker.js
CHANGED
|
@@ -41,7 +41,7 @@ export class CrawlWorker {
|
|
|
41
41
|
await rateLimiter.wait();
|
|
42
42
|
const result = await fetchPage(url, config.renderer);
|
|
43
43
|
// Extract content + convert to markdown
|
|
44
|
-
const { markdown, title, headings } = extractAndConvert(result.html, url);
|
|
44
|
+
const { markdown, title, headings } = extractAndConvert(result.html, url, result.contentType);
|
|
45
45
|
if (!markdown || markdown.length < 50) {
|
|
46
46
|
crawled++;
|
|
47
47
|
continue; // Skip essentially empty pages
|
|
package/dist/processor/extractor.js
CHANGED
|
@@ -25,7 +25,23 @@ turndown.addRule('removeImages', {
|
|
|
25
25
|
filter: 'img',
|
|
26
26
|
replacement: () => '',
|
|
27
27
|
});
|
|
28
|
-
export function extractAndConvert(html, url) {
|
|
28
|
+
export function extractAndConvert(html, url, contentType = 'text/html') {
|
|
29
|
+
const isMarkdown = url.endsWith('.md') || url.endsWith('.txt') || contentType.includes('text/plain') || contentType.includes('text/markdown');
|
|
30
|
+
if (isMarkdown && !html.trim().startsWith('<!DOCTYPE') && !html.trim().startsWith('<html')) {
|
|
31
|
+
const markdown = html.trim();
|
|
32
|
+
const titleMatch = markdown.match(/^#\s+(.+)$/m);
|
|
33
|
+
let title = titleMatch ? titleMatch[1].trim() : url.split('/').pop() || 'Untitled';
|
|
34
|
+
if (title.endsWith('.txt') || title.endsWith('.md')) {
|
|
35
|
+
title = title.slice(0, -4);
|
|
36
|
+
}
|
|
37
|
+
const headings = [];
|
|
38
|
+
const headingRegex = /^(#{1,6})\s+(.+)$/gm;
|
|
39
|
+
let match;
|
|
40
|
+
while ((match = headingRegex.exec(markdown)) !== null) {
|
|
41
|
+
headings.push({ level: match[1].length, text: match[2].trim() });
|
|
42
|
+
}
|
|
43
|
+
return { markdown, title, headings };
|
|
44
|
+
}
|
|
29
45
|
const { document } = parseHTML(html);
|
|
30
46
|
// Set the document URL for Readability to resolve relative links
|
|
31
47
|
if (url) {
|
package/dist/scraper/fetcher.js
CHANGED
|
@@ -17,7 +17,7 @@ export async function fetchPage(url, renderer = 'auto') {
|
|
|
17
17
|
return result;
|
|
18
18
|
}
|
|
19
19
|
// Auto mode: check if content is too short (possibly JS-rendered)
|
|
20
|
-
const { markdown } = extractAndConvert(result.html, url);
|
|
20
|
+
const { markdown } = extractAndConvert(result.html, url, result.contentType);
|
|
21
21
|
if (markdown.length >= MIN_CONTENT_LENGTH) {
|
|
22
22
|
return result;
|
|
23
23
|
}
|
|
@@ -52,6 +52,7 @@ async function fetchWithRetry(url, retries = MAX_RETRIES) {
|
|
|
52
52
|
status: response.status,
|
|
53
53
|
etag: response.headers.get('etag'),
|
|
54
54
|
lastModified: response.headers.get('last-modified'),
|
|
55
|
+
contentType: response.headers.get('content-type') || undefined,
|
|
55
56
|
};
|
|
56
57
|
}
|
|
57
58
|
catch (err) {
|
package/dist/types.d.ts
CHANGED
package/dist/version.d.ts
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export declare const VERSION = "0.1.8";
|
|
1
|
+
export declare const VERSION = "0.1.12";
|
package/dist/version.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
// This file is automatically updated by release-please
|
|
2
|
-
export const VERSION = '0.1.8'; // x-release-please-version
|
|
2
|
+
export const VERSION = '0.1.12'; // x-release-please-version
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "docshark",
|
|
3
|
-
"version": "0.1.8",
|
|
3
|
+
"version": "0.1.12",
|
|
4
4
|
"description": "🦈 Documentation MCP Server — scrape, index, and search any doc website",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -21,6 +21,16 @@
|
|
|
21
21
|
"LICENSE",
|
|
22
22
|
"CHANGELOG.md"
|
|
23
23
|
],
|
|
24
|
+
"author": "Michael Obele",
|
|
25
|
+
"license": "MIT",
|
|
26
|
+
"repository": {
|
|
27
|
+
"type": "git",
|
|
28
|
+
"url": "git+https://github.com/Michael-Obele/docshark.git"
|
|
29
|
+
},
|
|
30
|
+
"bugs": {
|
|
31
|
+
"url": "https://github.com/Michael-Obele/docshark/issues"
|
|
32
|
+
},
|
|
33
|
+
"homepage": "https://github.com/Michael-Obele/docshark#readme",
|
|
24
34
|
"scripts": {
|
|
25
35
|
"start": "bun run src/cli.ts start",
|
|
26
36
|
"dev": "bun run --watch src/cli.ts start",
|