@hej-ai/crawler 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,51 @@
1
+ # @hej-ai/crawler
2
+
3
+ Scrape any webpage into clean markdown.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ npm install @hej-ai/crawler
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```typescript
14
+ import {crawler} from "@hej-ai/crawler"
15
+
16
+ const {scrape} = crawler("your-api-key")
17
+
18
+ const page = await scrape("https://example.com")
19
+ console.log(page.title) // "Example Domain"
20
+ console.log(page.markdown) // clean markdown content
21
+ console.log(page.links) // internal links found on the page
22
+ ```
23
+
24
+ ### Scraping modes
25
+
26
+ ```typescript
27
+ // Fast HTTP fetch + Readability extraction
28
+ await scrape("https://example.com", "static")
29
+
30
+ // Full headless browser render (default)
31
+ await scrape("https://example.com", "playwright")
32
+
33
+ // Tries static first, uses browser if content differs significantly
34
+ await scrape("https://example.com", "scout")
35
+ ```
36
+
37
+ ### Response
38
+
39
+ ```typescript
40
+ {
41
+ url: string // final URL after redirects
42
+ title: string // page title
43
+ markdown: string // page content as markdown
44
+ links: string[] // internal links found
45
+ mode: string // mode used to produce the result
46
+ }
47
+ ```
48
+
49
+ ## License
50
+
51
+ MIT
package/dist/index.d.ts ADDED
@@ -0,0 +1,11 @@
1
+ export type ScrapeMode = "static" | "playwright" | "scout";
2
+ export type ScrapedPage = {
3
+ url: string;
4
+ title: string;
5
+ markdown: string;
6
+ links: string[];
7
+ mode: ScrapeMode;
8
+ };
9
+ export declare function crawler(apiKey: string, apiUrl?: string): {
10
+ scrape: (url: string, mode?: ScrapeMode) => Promise<ScrapedPage>;
11
+ };
package/dist/index.js ADDED
@@ -0,0 +1,50 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropNames = Object.getOwnPropertyNames;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
5
+ var __moduleCache = /* @__PURE__ */ new WeakMap;
6
+ var __toCommonJS = (from) => {
7
+ var entry = __moduleCache.get(from), desc;
8
+ if (entry)
9
+ return entry;
10
+ entry = __defProp({}, "__esModule", { value: true });
11
+ if (from && typeof from === "object" || typeof from === "function")
12
+ __getOwnPropNames(from).map((key) => !__hasOwnProp.call(entry, key) && __defProp(entry, key, {
13
+ get: () => from[key],
14
+ enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable
15
+ }));
16
+ __moduleCache.set(from, entry);
17
+ return entry;
18
+ };
19
+ var __export = (target, all) => {
20
+ for (var name in all)
21
+ __defProp(target, name, {
22
+ get: all[name],
23
+ enumerable: true,
24
+ configurable: true,
25
+ set: (newValue) => all[name] = () => newValue
26
+ });
27
+ };
28
+
29
+ // src/index.ts
30
+ var exports_src = {};
31
+ __export(exports_src, {
32
+ crawler: () => crawler
33
+ });
34
+ module.exports = __toCommonJS(exports_src);
35
+ var DEFAULT_API = "https://api.hej.chat/api/v1";
36
+ function crawler(apiKey, apiUrl = DEFAULT_API) {
37
+ async function scrape(url, mode) {
38
+ const res = await fetch(`${apiUrl}/scrape`, {
39
+ method: "POST",
40
+ headers: { Authorization: `Bearer ${apiKey}`, "Content-Type": "application/json" },
41
+ body: JSON.stringify({ url, mode })
42
+ });
43
+ if (!res.ok) {
44
+ const body = await res.json().catch(() => null);
45
+ throw new Error(body?.error ?? `Request failed with status ${res.status}`);
46
+ }
47
+ return res.json();
48
+ }
49
+ return { scrape };
50
+ }
package/package.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "name": "@hej-ai/crawler",
3
+ "version": "0.1.0",
4
+ "description": "Scrape any webpage into clean markdown",
5
+ "type": "module",
6
+ "main": "dist/index.cjs",
7
+ "module": "dist/index.js",
8
+ "types": "dist/index.d.ts",
9
+ "exports": {
10
+ ".": {
11
+ "import": "./dist/index.js",
12
+ "require": "./dist/index.cjs",
13
+ "types": "./dist/index.d.ts"
14
+ }
15
+ },
16
+ "files": ["dist"],
17
+ "scripts": {
18
+ "build": "bun build src/index.ts --outdir dist --format esm --target node && bun build src/index.ts --outdir dist --format cjs --target node --outfile dist/index.cjs && tsc --emitDeclarationOnly",
19
+ "prepublishOnly": "bun run build"
20
+ },
21
+ "devDependencies": {
22
+ "typescript": "^5.0.0"
23
+ },
24
+ "license": "MIT",
25
+ "repository": {
26
+ "type": "git",
27
+ "url": "https://github.com/hej-ai/crawler"
28
+ },
29
+ "keywords": ["scraper", "crawler", "markdown", "web-scraping"]
30
+ }