ara-generate 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +44 -0
  2. package/index.js +304 -0
  3. package/package.json +28 -0
package/README.md ADDED
@@ -0,0 +1,44 @@
1
+ # ara-generate
2
+
3
+ Generates a basic ARA manifest from existing website metadata. It fetches your site's home page, reads its `<title>`, meta/OpenGraph tags, JSON-LD, and `lang` attribute, checks whether robots.txt and sitemap.xml exist, and produces a Level 1 ARA manifest.
4
+
5
+ ## Quick Start
6
+
7
+ ```bash
8
+ # Generate to stdout
9
+ npx ara-generate https://example.com
10
+
11
+ # Save to file
12
+ npx ara-generate https://example.com --output .well-known/ara/manifest.json
13
+ ```
14
+
15
+ ## What It Extracts
16
+
17
+ - **Title & description** from `<meta>`, OpenGraph, and `<title>` tags
18
+ - **JSON-LD / Schema.org** structured data
19
+ - **Language** from `<html lang="...">`
20
+ - **Site type** inferred from content and JSON-LD types
21
+ - **robots.txt** and **sitemap.xml** presence
22
+
23
+ ## Output
24
+
25
+ A Level 1 ARA manifest (`manifest.json`) with:
26
+ - `identity` (name, type, description, locale, contact)
27
+ - `content_map` (basic resource listing)
28
+ - `capabilities` (placeholder for protocols)
29
+ - `policies` (default open access with rate limits)
30
+ - `meta` (generation timestamp, source URL)
31
+
32
+ The generated manifest is a starting point. Enrich it with:
33
+ - **Layer 2**: Add `schemas/*.json` for semantic data structure
34
+ - **Layer 3**: Add `actions.json` for agent interactions
35
+
36
+ ## Links
37
+
38
+ - **Spec**: https://ara-standard.org
39
+ - **GitHub**: https://github.com/aka9871/ara-standard
40
+ - **Validator**: `npx ara-validate https://yoursite.com`
41
+
42
+ ## License
43
+
44
+ MIT
package/index.js ADDED
@@ -0,0 +1,304 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * ARA Generator — Generates a basic ARA manifest from site metadata.
5
+ *
6
+ * Usage:
7
+ * npx ara-generate https://example.com
8
+ * npx ara-generate https://example.com --output .well-known/ara/manifest.json
9
+ *
10
+ * This tool:
11
+ * 1. Fetches the site's HTML
12
+ * 2. Extracts metadata (title, description, OpenGraph, Schema.org/JSON-LD)
13
+ * 3. Checks for robots.txt and sitemap.xml
14
+ * 4. Generates a Level 1 ARA manifest
15
+ *
16
+ * The generated manifest is a starting point — you'll want to enrich it
17
+ * with Layer 2 schemas and Layer 3 actions manually.
18
+ */
19
+
20
+ const https = require("https");
21
+ const http = require("http");
22
+ const fs = require("fs");
23
+ const path = require("path");
24
+ const { URL } = require("url");
25
+
26
+ // ── Helpers ────────────────────────────────────────────────────────────────
27
+
28
/**
 * Performs an HTTP(S) GET request, following redirects, and resolves with the
 * final response.
 *
 * @param {string} url - Absolute http:// or https:// URL to request.
 * @param {number} [maxRedirects=5] - Remaining number of requests allowed in
 *   the redirect chain; the promise rejects once this reaches zero.
 * @returns {Promise<{status: number, body: string, url: string}>} Status code,
 *   UTF-8 decoded body, and the URL that produced the response.
 */
function fetchUrl(url, maxRedirects = 5) {
  // Abort when the socket stays idle this long, so a stalled server cannot hang the CLI.
  const idleTimeoutMs = 15000;

  return new Promise((resolve, reject) => {
    if (maxRedirects <= 0) return reject(new Error("Too many redirects"));

    const client = url.startsWith("https") ? https : http;
    const request = client.get(
      url,
      { headers: { "User-Agent": "ARA-Generator/1.0" } },
      (res) => {
        if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
          // Drain the redirect response so its socket is released.
          res.resume();
          let redirectUrl;
          try {
            // Handles both absolute and relative Location values.
            redirectUrl = new URL(res.headers.location, url).href;
          } catch (err) {
            // A malformed Location must reject, not throw out of the callback.
            return reject(err);
          }
          return fetchUrl(redirectUrl, maxRedirects - 1).then(resolve, reject);
        }

        // Decode as a stream so multi-byte characters split across chunks
        // are not corrupted by per-chunk string conversion.
        res.setEncoding("utf8");
        let data = "";
        res.on("data", (chunk) => {
          data += chunk;
        });
        res.on("error", reject);
        res.on("end", () => resolve({ status: res.statusCode, body: data, url }));
      }
    );

    request.on("error", reject);
    request.setTimeout(idleTimeoutMs, () => {
      request.destroy(new Error(`Request timed out after ${idleTimeoutMs} ms: ${url}`));
    });
  });
}
48
+
49
/**
 * Returns the `content` of the first `<meta>` tag whose `name` attribute
 * equals `name`; if no such tag exists, falls back to the first tag whose
 * `property` attribute equals `name` (OpenGraph style).
 *
 * Attributes may appear in any order, may be mixed with unrelated attributes,
 * and may use either quote style. Comparison of `name` is case-insensitive.
 *
 * @param {string} html - HTML source to scan.
 * @param {string} name - Meta identifier, e.g. "description" or "og:title".
 * @returns {string|null} The content value (possibly empty), or null when no
 *   matching tag with a `content` attribute is found.
 */
function extractMeta(html, name) {
  const wanted = String(name).toLowerCase();
  // A whole <meta ...> tag; quoted values may themselves contain ">".
  const metaTagPattern = /<meta\b(?:[^>"']|"[^"]*"|'[^']*')*>/gi;
  // One attribute: key = "value" | 'value' | bare-value.
  const attributePattern = /([^\s"'<>\/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/g;

  const tags = (html.match(metaTagPattern) || []).map((tag) => {
    const attributes = new Map();
    // Skip the leading "<meta" so it is never mistaken for an attribute name.
    for (const found of tag.slice(5).matchAll(attributePattern)) {
      const key = found[1].toLowerCase();
      const value = [found[2], found[3], found[4]].find((v) => v !== undefined);
      // The first occurrence of a duplicated attribute wins.
      if (!attributes.has(key)) attributes.set(key, value);
    }
    return attributes;
  });

  // `name` matches take precedence over `property` matches.
  for (const idAttribute of ["name", "property"]) {
    for (const attributes of tags) {
      const id = attributes.get(idAttribute);
      if (id !== undefined && id.toLowerCase() === wanted && attributes.has("content")) {
        return attributes.get("content");
      }
    }
  }
  return null;
}
63
+
64
/**
 * Reads the text of the first `<title>` element in the given HTML.
 *
 * @param {string} html - HTML source to scan.
 * @returns {string|null} The title with surrounding whitespace removed, or
 *   null when no `<title>...</title>` pair is present.
 */
function extractTitle(html) {
  const found = html.match(/<title[^>]*>([^<]*)<\/title>/i);
  if (!found) {
    return null;
  }
  const [, rawTitle] = found;
  return rawTitle.trim();
}
68
+
69
/**
 * Collects every JSON-LD object embedded in the page.
 *
 * Finds `<script type="application/ld+json">` elements regardless of where the
 * `type` attribute sits among the tag's attributes, parses each body as JSON,
 * and flattens top-level arrays so the result is always a flat list of
 * objects. Blocks that fail to parse, and values that are not objects
 * (null, strings, numbers), are skipped.
 *
 * @param {string} html - HTML source to scan.
 * @returns {object[]} Parsed JSON-LD objects in document order.
 */
function extractJsonLd(html) {
  const results = [];
  const scriptPattern =
    /<script\b[^>]*\stype\s*=\s*["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;

  for (const block of html.matchAll(scriptPattern)) {
    let parsed;
    try {
      parsed = JSON.parse(block[1]);
    } catch {
      // Invalid JSON-LD, skip
      continue;
    }

    // A JSON-LD script may hold a single node or an array of nodes.
    const nodes = Array.isArray(parsed) ? parsed : [parsed];
    for (const node of nodes) {
      // Callers index into each entry, so only keep plain objects.
      if (node !== null && typeof node === "object" && !Array.isArray(node)) {
        results.push(node);
      }
    }
  }

  return results;
}
84
+
85
/**
 * Reads the quoted `lang` attribute from the page's `<html>` tag.
 *
 * @param {string} html - HTML source to scan.
 * @returns {string|null} The attribute value exactly as written (possibly an
 *   empty string), or null when the `<html>` tag has no quoted `lang`.
 */
function detectLanguage(html) {
  const langAttribute = html.match(/<html[^>]*\slang=["']([^"']*)["']/i);
  if (!langAttribute) {
    return null;
  }
  return langAttribute[1];
}
89
+
90
/**
 * Guesses the ARA site type for a page.
 *
 * JSON-LD `@type` values are consulted first, in document order; `@type` may
 * be a single string or an array of strings. If no JSON-LD entry matches, the
 * lower-cased title and description are scanned for keywords. Rules are
 * evaluated in the order listed, so earlier rules win.
 *
 * @param {string|null} description - Page description, if any.
 * @param {string|null} title - Page title, if any.
 * @param {Array<object>} jsonLd - Parsed JSON-LD entries; entries that are
 *   not objects are ignored.
 * @returns {string} One of "ecommerce", "restaurant", "blog", "news_media",
 *   "saas", "real_estate", "portfolio", "documentation", or "website".
 */
function inferSiteType(description, title, jsonLd) {
  // [substrings looked for inside an @type value, resulting site type]
  const typeRules = [
    [["Store", "Product"], "ecommerce"],
    [["Restaurant"], "restaurant"],
    [["Blog"], "blog"],
    [["NewsArticle", "NewsMediaOrganization"], "news_media"],
    [["SoftwareApplication"], "saas"],
    [["RealEstateAgent"], "real_estate"],
  ];

  // [keyword pattern tested against title + description, resulting site type]
  const keywordRules = [
    [/shop|store|buy|cart|product|ecommerce/, "ecommerce"],
    [/restaurant|menu|dine|reserv/, "restaurant"],
    [/blog|article|post|writing/, "blog"],
    [/news|media|journal/, "news_media"],
    [/saas|software|platform|app|tool|dashboard/, "saas"],
    [/portfolio|freelanc|design|agency/, "portfolio"],
    [/docs|documentation|api|reference/, "documentation"],
  ];

  // Check JSON-LD types first
  for (const ld of jsonLd || []) {
    if (ld === null || typeof ld !== "object") continue;

    // Normalise "@type" to a list: JSON-LD allows a string or an array.
    const declaredTypes = [].concat(ld["@type"] || []).filter((t) => typeof t === "string");
    for (const [needles, siteType] of typeRules) {
      const matches = declaredTypes.some((t) => needles.some((needle) => t.includes(needle)));
      if (matches) return siteType;
    }
  }

  // Keyword-based inference
  const text = `${title || ""} ${description || ""}`.toLowerCase();
  for (const [pattern, siteType] of keywordRules) {
    if (pattern.test(text)) return siteType;
  }

  return "website";
}
117
+
118
+ // ── Generator ──────────────────────────────────────────────────────────────
119
+
120
/**
 * Removes a trailing tagline from a page title, e.g. "Acme | Best widgets"
 * becomes "Acme". A pipe or en/em dash always counts as a separator; a plain
 * hyphen only counts when surrounded by whitespace, so hyphenated names such
 * as "Coca-Cola" are left intact. Falls back to the trimmed title when
 * stripping would leave nothing.
 */
function stripTagline(title) {
  const stripped = title.replace(/\s*[|–—].*$|\s+-\s.*$/, "").trim();
  return stripped || title.trim();
}

/**
 * Builds a Level 1 ARA manifest for a website from its public metadata.
 *
 * Fetches the page at `siteUrl`, extracts title, description, language,
 * OpenGraph image, and JSON-LD, probes for robots.txt and sitemap.xml at the
 * site origin, and assembles the manifest. Progress is written to stderr.
 * If the main page cannot be fetched, the process exits with code 1.
 *
 * @param {string} siteUrl - Absolute URL of the site (one trailing slash is ignored).
 * @returns {Promise<object>} The generated manifest.
 * @throws {TypeError} If `siteUrl` is not a valid absolute URL.
 */
async function generate(siteUrl) {
  const baseUrl = siteUrl.replace(/\/$/, "");
  const parsedUrl = new URL(baseUrl);
  const domain = parsedUrl.hostname;
  // robots.txt and sitemap.xml live at the site root, not under the page path.
  const origin = parsedUrl.origin;

  console.error(`\n ARA Generator v1.0`);
  console.error(` Analyzing ${baseUrl}...\n`);

  // Fetch main page
  let html = "";
  try {
    const response = await fetchUrl(baseUrl);
    html = response.body;
    console.error(` ✓ Fetched main page (${html.length} bytes)`);
    if (response.status >= 400) {
      // An error page would otherwise be treated as the site's real content without notice.
      console.error(` ! Main page responded with HTTP ${response.status}; extracted metadata may be unreliable`);
    }
  } catch (e) {
    console.error(` ✗ Could not fetch ${baseUrl}: ${e.message}`);
    process.exit(1);
  }

  // Extract metadata
  const title = extractMeta(html, "og:title") || extractTitle(html) || domain;
  const description =
    extractMeta(html, "og:description") ||
    extractMeta(html, "description") ||
    `Website at ${domain}`;
  const locale = detectLanguage(html);
  const image = extractMeta(html, "og:image");
  const jsonLd = extractJsonLd(html);
  const siteType = inferSiteType(description, title, jsonLd);

  console.error(` ✓ Extracted metadata: "${title}"`);
  console.error(` ✓ Detected type: ${siteType}`);
  if (jsonLd.length > 0) {
    console.error(` ✓ Found ${jsonLd.length} JSON-LD block(s)`);
  }

  // Check robots.txt
  let hasRobots = false;
  try {
    const robotsResponse = await fetchUrl(`${origin}/robots.txt`);
    hasRobots = robotsResponse.status === 200;
    console.error(` ${hasRobots ? "✓" : "—"} robots.txt ${hasRobots ? "found" : "not found"}`);
  } catch {
    console.error(" — Could not check robots.txt");
  }

  // Check sitemap
  let hasSitemap = false;
  try {
    const sitemapResponse = await fetchUrl(`${origin}/sitemap.xml`);
    hasSitemap = sitemapResponse.status === 200;
    console.error(` ${hasSitemap ? "✓" : "—"} sitemap.xml ${hasSitemap ? "found" : "not found"}`);
  } catch {
    console.error(" — Could not check sitemap.xml");
  }

  // Build manifest
  const manifest = {
    $ara: "1.0",
    $schema: "https://ara-standard.org/schema/manifest/v1",

    identity: {
      name: stripTagline(title), // Remove taglines
      type: siteType,
      description: description,
      ...(locale && { locale: [locale] }),
      contact: {
        website: baseUrl,
      },
      ...(image && {
        branding: {
          logo: image,
        },
      }),
    },

    content_map: {
      summary: `Content from ${domain}`,
      resources: [
        {
          id: "pages",
          type: "content",
          label: "Site Pages",
          description: "Pages available on this website",
          access: "public",
          freshness: "weekly",
        },
      ],
      // TODO: Enrich with detected resources from JSON-LD and sitemap
    },

    capabilities: {
      protocols: {},
      // TODO: Add detected APIs, MCP endpoints, etc.
    },

    policies: {
      agent_access: "open",
      rate_limit: {
        requests_per_minute: 30,
        burst: 5,
      },
      data_usage: {
        caching_allowed: true,
        cache_ttl: 3600,
        redistribution: false,
        attribution_required: true,
      },
    },

    meta: {
      generated_at: new Date().toISOString(),
      generator: "ara-generator/1.0",
      human_site: baseUrl,
    },
  };

  // Enrich from JSON-LD
  for (const ld of jsonLd) {
    if (ld === null || typeof ld !== "object") continue;

    // "@type" may be a single string or an array of strings.
    const types = [].concat(ld["@type"] || []);
    if (!types.includes("Organization") && !types.includes("LocalBusiness")) continue;

    if (ld.name) manifest.identity.name = ld.name;
    if (ld.address) {
      const address = typeof ld.address === "string" ? ld.address : ld.address.streetAddress;
      // Only emit geo when an address was actually found.
      if (address) manifest.identity.geo = { address };
    }
    if (ld.telephone) manifest.identity.contact.phone = ld.telephone;
    if (ld.email) manifest.identity.contact.email = ld.email;
  }

  console.error(`\n ✓ Generated ARA manifest (Level 1)`);
  console.error(` ℹ Enrich with schemas (Layer 2) and actions (Layer 3) for full ARA support.\n`);

  return manifest;
}
256
+
257
+ // ── CLI ────────────────────────────────────────────────────────────────────
258
+
259
/**
 * CLI entry point: parses arguments, generates the manifest, and writes it to
 * the `--output` file or to stdout.
 *
 * Recognised arguments:
 *   <url>            first argument starting with http:// or https://
 *   --output <file>  write the manifest to <file>, creating parent directories
 *   --help, -h       print usage and exit
 *
 * Prints usage and exits with code 0 when no URL is given or help is
 * requested. Exits with code 1 when `--output` has no file path after it.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = process.argv.slice(2);
  const outputIdx = args.indexOf("--output");
  const hasOutputFlag = outputIdx !== -1;
  const outputFile = hasOutputFlag ? args[outputIdx + 1] : null;
  const wantsHelp = args.includes("--help") || args.includes("-h");

  // Without this check a missing value would fall back to stdout without any warning.
  if (!wantsHelp && hasOutputFlag && (outputFile === undefined || outputFile.startsWith("--"))) {
    console.error("Error: --output requires a file path");
    process.exit(1);
  }

  // The value following --output is a file path, never the site URL, even if
  // it happens to start with "http".
  const url = args.find(
    (arg, idx) => /^https?:\/\//i.test(arg) && !(hasOutputFlag && idx === outputIdx + 1)
  );

  if (!url || wantsHelp) {
    console.log(`
ARA Generator v1.0
===================

Generates a basic ARA manifest from site metadata.

Usage:
npx ara-generate <url>
npx ara-generate <url> --output <file>

Examples:
npx ara-generate https://example.com
npx ara-generate https://myshop.com --output .well-known/ara/manifest.json

The generated manifest is a Level 1 starting point.
Add schemas (Layer 2) and actions (Layer 3) manually for full ARA support.
`);
    process.exit(0);
  }

  const manifest = await generate(url);
  const json = JSON.stringify(manifest, null, 2);

  if (outputFile) {
    const dir = path.dirname(outputFile);
    if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
    fs.writeFileSync(outputFile, json);
    console.error(` ✓ Saved to ${outputFile}`);
  } else {
    console.log(json);
  }
}
298
+
299
// Run the CLI only when this file is executed directly (node index.js, npx,
// or the installed bin). When the package is loaded with require() for its
// exported generate(), running main() would print the usage text and call
// process.exit() in the host application.
if (require.main === module) {
  main().catch((err) => {
    console.error("Error:", err.message);
    process.exit(1);
  });
}

module.exports = { generate };
package/package.json ADDED
@@ -0,0 +1,28 @@
1
+ {
2
+ "name": "ara-generate",
3
+ "version": "1.0.0",
4
+ "description": "Generates ARA manifests from existing website metadata",
5
+ "main": "index.js",
6
+ "bin": {
7
+ "ara-generate": "./index.js"
8
+ },
9
+ "keywords": [
10
+ "ara",
11
+ "agent-ready",
12
+ "web-standard",
13
+ "ai-agents",
14
+ "manifest",
15
+ "generator",
16
+ "mcp"
17
+ ],
18
+ "author": "ARA Standard Contributors",
19
+ "license": "MIT",
20
+ "repository": {
21
+ "type": "git",
22
+ "url": "https://github.com/aka9871/ara-standard"
23
+ },
24
+ "homepage": "https://ara-standard.org",
25
+ "engines": {
26
+ "node": ">=16.0.0"
27
+ }
28
+ }