@hej-ai/crawler 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +51 -0
- package/dist/index.d.ts +11 -0
- package/dist/index.js +50 -0
- package/package.json +30 -0
package/README.md
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# @hej-ai/crawler
|
|
2
|
+
|
|
3
|
+
Scrape any webpage into clean markdown.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install @hej-ai/crawler
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```typescript
|
|
14
|
+
import {crawler} from "@hej-ai/crawler"
|
|
15
|
+
|
|
16
|
+
const {scrape} = crawler("your-api-key")
|
|
17
|
+
|
|
18
|
+
const page = await scrape("https://example.com")
|
|
19
|
+
console.log(page.title) // "Example Domain"
|
|
20
|
+
console.log(page.markdown) // clean markdown content
|
|
21
|
+
console.log(page.links) // internal links found on the page
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
### Scraping modes
|
|
25
|
+
|
|
26
|
+
```typescript
|
|
27
|
+
// Fast HTTP fetch + Readability extraction
|
|
28
|
+
await scrape("https://example.com", "static")
|
|
29
|
+
|
|
30
|
+
// Full headless browser render (default)
|
|
31
|
+
await scrape("https://example.com", "playwright")
|
|
32
|
+
|
|
33
|
+
// Tries static first, uses browser if content differs significantly
|
|
34
|
+
await scrape("https://example.com", "scout")
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### Response
|
|
38
|
+
|
|
39
|
+
```typescript
|
|
40
|
+
{
|
|
41
|
+
url: string // final URL after redirects
|
|
42
|
+
title: string // page title
|
|
43
|
+
markdown: string // page content as markdown
|
|
44
|
+
links: string[] // internal links found
|
|
45
|
+
mode: string // mode used to produce the result
|
|
46
|
+
}
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## License
|
|
50
|
+
|
|
51
|
+
MIT
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
 * Rendering strategy for a scrape request.
 * - "static": fast HTTP fetch + Readability extraction
 * - "playwright": full headless browser render
 * - "scout": tries static first, falls back to the browser
 */
export type ScrapeMode = "static" | "playwright" | "scout";
/** Result of scraping a single page. */
export type ScrapedPage = {
    /** Final URL after redirects. */
    url: string;
    /** Page title. */
    title: string;
    /** Page content converted to markdown. */
    markdown: string;
    /** Internal links found on the page. */
    links: string[];
    /** Mode actually used to produce this result. */
    mode: ScrapeMode;
};
/**
 * Create a crawler client bound to an API key.
 *
 * @param apiKey - Bearer token sent with every request.
 * @param apiUrl - Optional API base URL; defaults to the hosted service.
 * @returns An object exposing `scrape(url, mode?)`, which resolves to the
 *   scraped page and rejects with an `Error` on API failure.
 */
export declare function crawler(apiKey: string, apiUrl?: string): {
    scrape: (url: string, mode?: ScrapeMode) => Promise<ScrapedPage>;
};
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
// Bundler-generated (appears to be Bun output) CommonJS interop helpers.
// Cached aliases of Object intrinsics used by the helpers below.
var __defProp = Object.defineProperty;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __hasOwnProp = Object.prototype.hasOwnProperty;
// Memoizes the CJS view of each ESM namespace object so repeated
// __toCommonJS calls on the same module return the same object.
var __moduleCache = /* @__PURE__ */ new WeakMap;
// Wraps an ESM-style exports object into a CommonJS exports object:
// marks it __esModule and mirrors every own property as a live getter.
var __toCommonJS = (from) => {
  var entry = __moduleCache.get(from), desc;
  if (entry)
    return entry;
  // Non-enumerable __esModule marker so ESM-aware consumers unwrap correctly.
  entry = __defProp({}, "__esModule", { value: true });
  if (from && typeof from === "object" || typeof from === "function")
    // Copy each export as a getter (live binding), skipping keys already
    // present on the target (e.g. __esModule itself).
    __getOwnPropNames(from).map((key) => !__hasOwnProp.call(entry, key) && __defProp(entry, key, {
      get: () => from[key],
      // Preserve the source property's enumerability; default to enumerable
      // when no descriptor exists.
      enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable
    }));
  __moduleCache.set(from, entry);
  return entry;
};
|
|
19
|
+
// Bundler-generated helper: defines each entry of `all` (a map of export
// name -> getter function) on `target` as a live accessor property.
var __export = (target, all) => {
  for (var name in all)
    __defProp(target, name, {
      // `all[name]` is itself the getter thunk for the export.
      get: all[name],
      enumerable: true,
      configurable: true,
      // Assigning to the export rebinds the getter to return the new value
      // (the bundler's mechanism for mutable live bindings).
      set: (newValue) => all[name] = () => newValue
    });
};
|
|
28
|
+
|
|
29
|
+
// src/index.ts
// Module export registration: collect the ESM exports of src/index.ts into
// `exports_src`, then publish them through the CommonJS interop wrapper.
var exports_src = {};
__export(exports_src, {
  crawler: () => crawler
});
module.exports = __toCommonJS(exports_src);
|
|
35
|
+
var DEFAULT_API = "https://api.hej.chat/api/v1";
|
|
36
|
+
function crawler(apiKey, apiUrl = DEFAULT_API) {
|
|
37
|
+
async function scrape(url, mode) {
|
|
38
|
+
const res = await fetch(`${apiUrl}/scrape`, {
|
|
39
|
+
method: "POST",
|
|
40
|
+
headers: { Authorization: `Bearer ${apiKey}`, "Content-Type": "application/json" },
|
|
41
|
+
body: JSON.stringify({ url, mode })
|
|
42
|
+
});
|
|
43
|
+
if (!res.ok) {
|
|
44
|
+
const body = await res.json().catch(() => null);
|
|
45
|
+
throw new Error(body?.error ?? `Request failed with status ${res.status}`);
|
|
46
|
+
}
|
|
47
|
+
return res.json();
|
|
48
|
+
}
|
|
49
|
+
return { scrape };
|
|
50
|
+
}
|
package/package.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@hej-ai/crawler",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Scrape any webpage into clean markdown",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "dist/index.cjs",
|
|
7
|
+
"module": "dist/index.js",
|
|
8
|
+
"types": "dist/index.d.ts",
|
|
9
|
+
"exports": {
    ".": {
      "types": "./dist/index.d.ts",
      "import": "./dist/index.js",
      "require": "./dist/index.cjs"
    }
  },
|
|
16
|
+
"files": ["dist"],
|
|
17
|
+
"scripts": {
|
|
18
|
+
"build": "bun build src/index.ts --outdir dist --format esm --target node && bun build src/index.ts --format cjs --target node --outfile dist/index.cjs && tsc --emitDeclarationOnly",
|
|
19
|
+
"prepublishOnly": "bun run build"
|
|
20
|
+
},
|
|
21
|
+
"devDependencies": {
|
|
22
|
+
"typescript": "^5.0.0"
|
|
23
|
+
},
|
|
24
|
+
"license": "MIT",
|
|
25
|
+
"repository": {
|
|
26
|
+
"type": "git",
|
|
27
|
+
"url": "https://github.com/hej-ai/crawler"
|
|
28
|
+
},
|
|
29
|
+
"keywords": ["scraper", "crawler", "markdown", "web-scraping"]
|
|
30
|
+
}
|