@nanocollective/get-md 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/LICENSE +36 -0
  2. package/README.md +205 -0
  3. package/bin/get-md.js +4 -0
  4. package/dist/cli.d.ts +3 -0
  5. package/dist/cli.d.ts.map +1 -0
  6. package/dist/cli.js +91 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/cli.spec.d.ts +2 -0
  9. package/dist/cli.spec.d.ts.map +1 -0
  10. package/dist/cli.spec.js +278 -0
  11. package/dist/cli.spec.js.map +1 -0
  12. package/dist/config.d.ts +5 -0
  13. package/dist/config.d.ts.map +1 -0
  14. package/dist/config.js +6 -0
  15. package/dist/config.js.map +1 -0
  16. package/dist/extractors/metadata-extractor.d.ts +6 -0
  17. package/dist/extractors/metadata-extractor.d.ts.map +1 -0
  18. package/dist/extractors/metadata-extractor.js +131 -0
  19. package/dist/extractors/metadata-extractor.js.map +1 -0
  20. package/dist/index.d.ts +44 -0
  21. package/dist/index.d.ts.map +1 -0
  22. package/dist/index.js +70 -0
  23. package/dist/index.js.map +1 -0
  24. package/dist/optimizers/html-cleaner.d.ts +12 -0
  25. package/dist/optimizers/html-cleaner.d.ts.map +1 -0
  26. package/dist/optimizers/html-cleaner.js +228 -0
  27. package/dist/optimizers/html-cleaner.js.map +1 -0
  28. package/dist/optimizers/llm-formatter.d.ts +8 -0
  29. package/dist/optimizers/llm-formatter.d.ts.map +1 -0
  30. package/dist/optimizers/llm-formatter.js +94 -0
  31. package/dist/optimizers/llm-formatter.js.map +1 -0
  32. package/dist/optimizers/structure-enhancer.d.ts +8 -0
  33. package/dist/optimizers/structure-enhancer.d.ts.map +1 -0
  34. package/dist/optimizers/structure-enhancer.js +92 -0
  35. package/dist/optimizers/structure-enhancer.js.map +1 -0
  36. package/dist/parsers/markdown-parser.d.ts +16 -0
  37. package/dist/parsers/markdown-parser.d.ts.map +1 -0
  38. package/dist/parsers/markdown-parser.js +369 -0
  39. package/dist/parsers/markdown-parser.js.map +1 -0
  40. package/dist/types.d.ts +115 -0
  41. package/dist/types.d.ts.map +1 -0
  42. package/dist/types.js +3 -0
  43. package/dist/types.js.map +1 -0
  44. package/dist/utils/url-fetcher.d.ts +10 -0
  45. package/dist/utils/url-fetcher.d.ts.map +1 -0
  46. package/dist/utils/url-fetcher.js +54 -0
  47. package/dist/utils/url-fetcher.js.map +1 -0
  48. package/dist/utils/validators.d.ts +5 -0
  49. package/dist/utils/validators.d.ts.map +1 -0
  50. package/dist/utils/validators.js +23 -0
  51. package/dist/utils/validators.js.map +1 -0
  52. package/package.json +104 -0
@@ -0,0 +1,54 @@
1
+ // src/utils/url-fetcher.ts
2
+ import { DEFAULT_USER_AGENT, DEFAULT_FETCH_TIMEOUT } from "../config.js";
3
/**
 * Fetch HTML from a URL with timeout and redirect handling.
 *
 * @param {string} url - Address to request.
 * @param {object} [options] - Optional request settings.
 * @param {number} [options.timeout] - Milliseconds before the request is
 *   aborted; falls back to DEFAULT_FETCH_TIMEOUT when null/undefined.
 * @param {string} [options.userAgent] - Value for the User-Agent header;
 *   falls back to DEFAULT_USER_AGENT when null/undefined.
 * @param {boolean} [options.followRedirects] - When true (the default) the
 *   request uses redirect mode "follow", otherwise "manual".
 * @param {Record<string, string>} [options.headers] - Extra headers; they are
 *   spread last, so they override the built-in headers of the same name.
 * @returns {Promise<string>} The response body as text.
 * @throws {Error} "Request timeout after <n>ms" when the request is aborted,
 *   or "Failed to fetch URL: <reason>" for any other Error, including a
 *   response whose `ok` flag is false. The underlying error is attached as
 *   `cause`. Non-Error throwables are rethrown untouched.
 */
export async function fetchUrl(url, options = {}) {
    const timeout = options.timeout ?? DEFAULT_FETCH_TIMEOUT;
    const userAgent = options.userAgent ?? DEFAULT_USER_AGENT;
    const followRedirects = options.followRedirects ?? true;
    const headers = {
        "User-Agent": userAgent,
        Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        ...options.headers,
    };
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), timeout);
    try {
        const response = await fetch(url, {
            headers,
            signal: controller.signal,
            redirect: followRedirects ? "follow" : "manual",
        });
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}: ${response.statusText}`);
        }
        // The abort timer is still armed here, so a stalled body download is
        // subject to the same timeout as the initial request.
        return await response.text();
    }
    catch (error) {
        if (error instanceof Error) {
            if (error.name === "AbortError") {
                throw new Error(`Request timeout after ${timeout}ms`, { cause: error });
            }
            throw new Error(`Failed to fetch URL: ${error.message}`, { cause: error });
        }
        throw error;
    }
    finally {
        // Always release the timer: on failure paths a pending timer would
        // otherwise keep the process alive until it fires.
        clearTimeout(timeoutId);
    }
}
42
/**
 * Check whether a string parses as an absolute URL using http or https.
 *
 * @param {string} urlString - Candidate URL text.
 * @returns {boolean} True only when `new URL()` accepts the input and its
 *   protocol is "http:" or "https:"; false for anything else, including
 *   input the URL constructor rejects.
 */
export function isValidUrl(urlString) {
    let parsed;
    try {
        parsed = new URL(urlString);
    }
    catch {
        // The URL constructor throws on unparseable input.
        return false;
    }
    const scheme = parsed.protocol;
    if (scheme === "https:") {
        return true;
    }
    return scheme === "http:";
}
54
+ //# sourceMappingURL=url-fetcher.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"url-fetcher.js","sourceRoot":"","sources":["../../src/utils/url-fetcher.ts"],"names":[],"mappings":"AAAA,2BAA2B;AAG3B,OAAO,EAAE,kBAAkB,EAAE,qBAAqB,EAAE,MAAM,cAAc,CAAC;AAEzE;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,QAAQ,CAC5B,GAAW,EACX,UAAwB,EAAE;IAE1B,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,qBAAqB,CAAC;IACzD,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,kBAAkB,CAAC;IAC1D,MAAM,eAAe,GAAG,OAAO,CAAC,eAAe,IAAI,IAAI,CAAC;IAExD,MAAM,OAAO,GAA2B;QACtC,YAAY,EAAE,SAAS;QACvB,MAAM,EAAE,iEAAiE;QACzE,iBAAiB,EAAE,gBAAgB;QACnC,iBAAiB,EAAE,mBAAmB;QACtC,GAAG,OAAO,CAAC,OAAO;KACnB,CAAC;IAEF,IAAI,CAAC;QACH,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;QACzC,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,OAAO,CAAC,CAAC;QAEhE,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YAChC,OAAO;YACP,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,QAAQ,EAAE,eAAe,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ;SAChD,CAAC,CAAC;QAEH,YAAY,CAAC,SAAS,CAAC,CAAC;QAExB,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,MAAM,IAAI,KAAK,CAAC,QAAQ,QAAQ,CAAC,MAAM,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;QACrE,CAAC;QAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QACnC,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,IAAI,KAAK,CAAC,IAAI,KAAK,YAAY,EAAE,CAAC;gBAChC,MAAM,IAAI,KAAK,CAAC,yBAAyB,OAAO,IAAI,CAAC,CAAC;YACxD,CAAC;YACD,MAAM,IAAI,KAAK,CAAC,wBAAwB,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QAC3D,CAAC;QACD,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,SAAiB;IAC1C,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,CAAC;QAC/B,OAAO,GAAG,CAAC,QAAQ,KAAK,OAAO,IAAI,GAAG,CAAC,QAAQ,KAAK,QAAQ,CAAC;IAC/D,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC"}
@@ -0,0 +1,5 @@
1
+ /**
2
+ * Validate if HTML contains extractable content
3
+ */
4
+ export declare function hasContent(html: string): boolean;
5
+ //# sourceMappingURL=validators.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"validators.d.ts","sourceRoot":"","sources":["../../src/utils/validators.ts"],"names":[],"mappings":"AAIA;;GAEG;AACH,wBAAgB,UAAU,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAmBhD"}
@@ -0,0 +1,23 @@
1
+ // src/utils/validators.ts
2
+ import * as cheerio from "cheerio";
3
/**
 * Validate if HTML contains extractable content.
 *
 * @param {string} html - Raw HTML markup to inspect.
 * @returns {boolean} True when the body text, after removing script, style,
 *   nav, header and footer elements and trimming whitespace, is at least 100
 *   characters long. Returns false for empty or non-string input and when
 *   parsing throws.
 */
export function hasContent(html) {
    const isNonEmptyString = typeof html === "string" && html.length > 0;
    if (!isNonEmptyString) {
        return false;
    }
    try {
        const doc = cheerio.load(html);
        // Drop scripts, styles, and common page chrome before measuring text.
        doc("script, style, nav, header, footer").remove();
        const bodyText = doc("body").text();
        const visibleLength = bodyText.trim().length;
        // At least 100 characters of text counts as having content.
        return visibleLength >= 100;
    }
    catch {
        return false;
    }
}
23
+ //# sourceMappingURL=validators.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"validators.js","sourceRoot":"","sources":["../../src/utils/validators.ts"],"names":[],"mappings":"AAAA,0BAA0B;AAE1B,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAEnC;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,IAAY;IACrC,IAAI,CAAC,IAAI,IAAI,OAAO,IAAI,KAAK,QAAQ,EAAE,CAAC;QACtC,OAAO,KAAK,CAAC;IACf,CAAC;IAED,IAAI,CAAC;QACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE7B,2CAA2C;QAC3C,CAAC,CAAC,oCAAoC,CAAC,CAAC,MAAM,EAAE,CAAC;QAEjD,mBAAmB;QACnB,MAAM,IAAI,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;QAErC,qEAAqE;QACrE,OAAO,IAAI,CAAC,MAAM,IAAI,GAAG,CAAC;IAC5B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC"}
package/package.json ADDED
@@ -0,0 +1,104 @@
1
+ {
2
+ "name": "@nanocollective/get-md",
3
+ "version": "1.0.0",
4
+ "description": "Fast HTML to Markdown converter optimized for LLM consumption",
5
+ "type": "module",
6
+ "main": "./dist/index.js",
7
+ "types": "./dist/index.d.ts",
8
+ "bin": {
9
+ "getmd": "./bin/get-md.js"
10
+ },
11
+ "exports": {
12
+ ".": {
13
+ "types": "./dist/index.d.ts",
14
+ "import": "./dist/index.js"
15
+ }
16
+ },
17
+ "files": [
18
+ "dist",
19
+ "bin",
20
+ "README.md",
21
+ "LICENSE"
22
+ ],
23
+ "engines": {
24
+ "node": ">=18"
25
+ },
26
+ "scripts": {
27
+ "build": "tsc && chmod +x bin/get-md.js",
28
+ "dev": "tsc --watch",
29
+ "test:all": "./scripts/test.sh",
30
+ "test:ava": "ava",
31
+ "test:ava:coverage": "c8 ava",
32
+ "test:format": "prettier --check .",
33
+ "test:types": "tsc --noEmit",
34
+ "test:lint": "eslint .",
35
+ "test:lint:fix": "eslint . --fix",
36
+ "test:knip": "knip",
37
+ "format": "prettier --write .",
38
+ "prepublishOnly": "pnpm run build && pnpm run test:all"
39
+ },
40
+ "keywords": [
41
+ "html",
42
+ "markdown",
43
+ "json",
44
+ "converter",
45
+ "parser",
46
+ "llm",
47
+ "readability",
48
+ "turndown",
49
+ "cheerio",
50
+ "fast",
51
+ "local"
52
+ ],
53
+ "dependencies": {
54
+ "@mozilla/readability": "^0.6.0",
55
+ "ajv": "^8.17.1",
56
+ "cheerio": "^1.1.2",
57
+ "commander": "^14.0.2",
58
+ "jsdom": "^24.1.3",
59
+ "turndown": "^7.2.2",
60
+ "turndown-plugin-gfm": "^1.0.2"
61
+ },
62
+ "repository": {
63
+ "type": "git",
64
+ "url": "https://github.com/nano-collective/get-md.git"
65
+ },
66
+ "license": "MIT",
67
+ "devDependencies": {
68
+ "@ava/typescript": "^6.0.0",
69
+ "@eslint/js": "^9.38.0",
70
+ "@types/jsdom": "^27.0.0",
71
+ "@types/node": "^24.9.1",
72
+ "@types/turndown": "^5.0.6",
73
+ "@typescript-eslint/eslint-plugin": "^8.46.2",
74
+ "@typescript-eslint/parser": "^8.46.2",
75
+ "ava": "^6.4.1",
76
+ "c8": "^10.1.3",
77
+ "eslint": "^9.38.0",
78
+ "globals": "^16.4.0",
79
+ "knip": "^5.66.3",
80
+ "prettier": "^3.6.2",
81
+ "tsx": "^4.20.6",
82
+ "typescript": "^5.9.3",
83
+ "typescript-eslint": "^8.46.2"
84
+ },
85
+ "ava": {
86
+ "extensions": {
87
+ "ts": "module"
88
+ },
89
+ "nodeArguments": [
90
+ "--import=tsx",
91
+ "--no-warnings"
92
+ ],
93
+ "files": [
94
+ "src/**/*.spec.ts",
95
+ "!src/**/*-helpers.ts",
96
+ "!src/**/test-helpers.ts"
97
+ ],
98
+ "workerThreads": false,
99
+ "serial": true,
100
+ "environmentVariables": {
101
+ "NODE_ENV": "test"
102
+ }
103
+ }
104
+ }