ara-generate 1.0.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +102 -24
- package/index.js +716 -113
- package/package.json +12 -4
package/README.md
CHANGED
|
@@ -1,42 +1,120 @@
|
|
|
1
1
|
# ara-generate
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
> Generate ARA manifests + schemas from any website. Zero dependencies.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
Part of the [ARA Standard](https://ara-standard.org) — Agent-Ready Architecture.
|
|
6
|
+
|
|
7
|
+
## Install & Run
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
# No install needed — run directly with npx
|
|
11
|
+
npx ara-generate https://yoursite.com
|
|
12
|
+
|
|
13
|
+
# Or install globally
|
|
14
|
+
npm install -g ara-generate
|
|
15
|
+
ara-generate https://yoursite.com
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## What it does
|
|
19
|
+
|
|
20
|
+
The generator analyzes your website and produces:
|
|
21
|
+
|
|
22
|
+
**Layer 1 — Manifest** (always generated)
|
|
23
|
+
- Extracts identity from `<title>`, OpenGraph, JSON-LD Organization data
|
|
24
|
+
- Detects site type (ecommerce, blog, restaurant, saas, etc.)
|
|
25
|
+
- Detects language, contact info, branding
|
|
26
|
+
- Analyzes sitemap.xml to detect resource patterns
|
|
27
|
+
- Checks for OpenAPI/Swagger endpoints
|
|
28
|
+
|
|
29
|
+
**Layer 2 — Schemas** (when structured data is present)
|
|
30
|
+
- Converts JSON-LD `<script type="application/ld+json">` to ARA schemas
|
|
31
|
+
- Converts Microdata (`itemscope`, `itemprop`) to ARA schemas
|
|
32
|
+
- Generates `search_hints` (filterable_by, sortable_by, text_searchable)
|
|
33
|
+
- Adds semantic annotations (`schema:name`, `schema:price`, etc.)
|
|
34
|
+
|
|
35
|
+
## Usage
|
|
6
36
|
|
|
7
37
|
```bash
|
|
8
|
-
#
|
|
9
|
-
npx ara-generate https://
|
|
38
|
+
# Basic: outputs manifest JSON to stdout
|
|
39
|
+
npx ara-generate https://yoursite.com
|
|
40
|
+
|
|
41
|
+
# Save to .well-known/ara/ directory (manifest.json + schemas/)
|
|
42
|
+
npx ara-generate https://yoursite.com --output .well-known/ara/
|
|
43
|
+
|
|
44
|
+
# Layer 1 only (skip schema detection)
|
|
45
|
+
npx ara-generate https://yoursite.com --layer 1
|
|
46
|
+
|
|
47
|
+
# Crawl internal pages to find more structured data
|
|
48
|
+
npx ara-generate https://yoursite.com --crawl 10
|
|
49
|
+
|
|
50
|
+
# Full generation with crawling
|
|
51
|
+
npx ara-generate https://yoursite.com --crawl 10 --output .well-known/ara/
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## When does Layer 2 work?
|
|
55
|
+
|
|
56
|
+
Layer 2 schemas are **auto-generated** when your site has structured data:
|
|
57
|
+
|
|
58
|
+
| Source | Confidence | Example |
|
|
59
|
+
|--------|-----------|---------|
|
|
60
|
+
| JSON-LD (`<script type="application/ld+json">`) | **HIGH** | Most modern e-commerce, news sites, recipe sites |
|
|
61
|
+
| Microdata (`itemscope`, `itemprop`) | **MEDIUM** | Older sites, some WordPress themes |
|
|
62
|
+
| OpenAPI/Swagger | **Protocol detection** | API-first sites |
|
|
63
|
+
|
|
64
|
+
### Supported Schema.org types
|
|
65
|
+
|
|
66
|
+
Product, Article, NewsArticle, BlogPosting, Recipe, Event, Restaurant, LocalBusiness, Course
|
|
67
|
+
|
|
68
|
+
### When Layer 2 cannot be auto-generated
|
|
10
69
|
|
|
11
|
-
|
|
12
|
-
|
|
70
|
+
- Site has **no structured data** (plain HTML only) → Layer 1 only, with guidance on what to add
|
|
71
|
+
- Content is **rendered client-side** (SPA without SSR) → Generator sees empty HTML
|
|
72
|
+
- Data is **behind authentication** → Generator can only access public pages
|
|
73
|
+
|
|
74
|
+
In these cases, the generator produces Layer 1 and prints instructions on how to enable Layer 2.
|
|
75
|
+
|
|
76
|
+
## Output structure
|
|
77
|
+
|
|
78
|
+
```
|
|
79
|
+
.well-known/ara/
|
|
80
|
+
├── manifest.json ← Always generated
|
|
81
|
+
└── schemas/ ← Generated when structured data found
|
|
82
|
+
├── product.json
|
|
83
|
+
├── article.json
|
|
84
|
+
└── ...
|
|
13
85
|
```
|
|
14
86
|
|
|
15
|
-
##
|
|
87
|
+
## Example output
|
|
16
88
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
89
|
+
```
|
|
90
|
+
ARA Generator v2.0.0
|
|
91
|
+
Target: https://myshop.com
|
|
92
|
+
Mode: Layer 1 + Layer 2 (schemas)
|
|
93
|
+
─────────────────────────────────────────
|
|
94
|
+
|
|
95
|
+
✓ Fetched main page (45.2 KB)
|
|
96
|
+
✓ Extracted metadata: "MyShop"
|
|
97
|
+
✓ Detected type: ecommerce
|
|
98
|
+
✓ Found 3 JSON-LD block(s): Product, Organization, BreadcrumbList
|
|
99
|
+
✓ sitemap.xml (1,250 URLs, 4 pattern(s))
|
|
22
100
|
|
|
23
|
-
|
|
101
|
+
── Layer 2: Schema Detection ──────────────────
|
|
102
|
+
✓ Generated 1 schema(s) from JSON-LD
|
|
24
103
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
- `meta` (generation timestamp, source URL)
|
|
104
|
+
── Results ───────────────────────────────────
|
|
105
|
+
✓ Generated ARA manifest (Layer 2)
|
|
106
|
+
Identity: MyShop (ecommerce)
|
|
107
|
+
Resources: 5
|
|
108
|
+
Schemas: 1
|
|
31
109
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
110
|
+
✓ Saved .well-known/ara/manifest.json
|
|
111
|
+
✓ Saved .well-known/ara/schemas/product.json
|
|
112
|
+
```
|
|
35
113
|
|
|
36
114
|
## Links
|
|
37
115
|
|
|
38
|
-
- **
|
|
39
|
-
- **
|
|
116
|
+
- **Standard**: [ara-standard.org](https://ara-standard.org)
|
|
117
|
+
- **Specification**: [GitHub](https://github.com/aka9871/ara-standard)
|
|
40
118
|
- **Validator**: `npx ara-validate https://yoursite.com`
|
|
41
119
|
|
|
42
120
|
## License
|
package/index.js
CHANGED
|
@@ -1,20 +1,25 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
|
-
* ARA Generator — Generates
|
|
4
|
+
* ARA Generator v2.0 — Generates ARA manifests + schemas from site metadata.
|
|
5
5
|
*
|
|
6
6
|
* Usage:
|
|
7
7
|
* npx ara-generate https://example.com
|
|
8
|
-
* npx ara-generate https://example.com --output .well-known/ara/
|
|
8
|
+
* npx ara-generate https://example.com --output .well-known/ara/
|
|
9
|
+
* npx ara-generate https://example.com --layer 1 (manifest only)
|
|
10
|
+
* npx ara-generate https://example.com --layer 2 (manifest + schemas)
|
|
11
|
+
* npx ara-generate https://example.com --crawl 5 (follow up to 5 pages)
|
|
9
12
|
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
13
|
+
* Layer 2 schema generation works best when the site has:
|
|
14
|
+
* - JSON-LD structured data (Schema.org Product, Article, Recipe, Event, etc.)
|
|
15
|
+
* - OpenGraph metadata with typed content
|
|
16
|
+
* - Microdata attributes (itemscope, itemprop)
|
|
17
|
+
* - An OpenAPI/Swagger endpoint
|
|
15
18
|
*
|
|
16
|
-
*
|
|
17
|
-
*
|
|
19
|
+
* If none of these are present, the generator produces Layer 1 only and
|
|
20
|
+
* explains what the site owner needs to add manually.
|
|
21
|
+
*
|
|
22
|
+
* Zero dependencies — uses only Node.js stdlib.
|
|
18
23
|
*/
|
|
19
24
|
|
|
20
25
|
const https = require("https");
|
|
@@ -23,7 +28,199 @@ const fs = require("fs");
|
|
|
23
28
|
const path = require("path");
|
|
24
29
|
const { URL } = require("url");
|
|
25
30
|
|
|
26
|
-
// ──
|
|
31
|
+
// ── Constants ─────────────────────────────────────────────────────────────
|
|
32
|
+
|
|
33
|
+
const VERSION = "2.0.0";
|
|
34
|
+
const USER_AGENT = `ARA-Generator/${VERSION}`;
|
|
35
|
+
|
|
36
|
+
// Known JSON-LD types → ARA schema mappings
|
|
37
|
+
const JSONLD_TYPE_MAP = {
|
|
38
|
+
Product: {
|
|
39
|
+
resource_type: "catalog",
|
|
40
|
+
schema_name: "product",
|
|
41
|
+
properties: {
|
|
42
|
+
name: { type: "string", required: true, semantic: "schema:name" },
|
|
43
|
+
description: { type: "string", semantic: "schema:description" },
|
|
44
|
+
price: { type: "number", required: true, semantic: "schema:price" },
|
|
45
|
+
currency: { type: "string", semantic: "schema:priceCurrency" },
|
|
46
|
+
image: { type: "string", format: "uri", semantic: "schema:image" },
|
|
47
|
+
url: { type: "string", format: "uri", semantic: "schema:url" },
|
|
48
|
+
sku: { type: "string", semantic: "schema:sku" },
|
|
49
|
+
brand: { type: "string", semantic: "schema:brand" },
|
|
50
|
+
availability: { type: "string", semantic: "schema:availability" },
|
|
51
|
+
category: { type: "string", semantic: "schema:category" },
|
|
52
|
+
rating: { type: "number", semantic: "schema:aggregateRating" },
|
|
53
|
+
review_count: { type: "integer", semantic: "schema:reviewCount" },
|
|
54
|
+
},
|
|
55
|
+
search_hints: {
|
|
56
|
+
filterable_by: ["category", "brand", "availability"],
|
|
57
|
+
sortable_by: ["price", "rating", "name"],
|
|
58
|
+
text_searchable: ["name", "description"],
|
|
59
|
+
},
|
|
60
|
+
},
|
|
61
|
+
Article: {
|
|
62
|
+
resource_type: "content",
|
|
63
|
+
schema_name: "article",
|
|
64
|
+
properties: {
|
|
65
|
+
title: { type: "string", required: true, semantic: "schema:headline" },
|
|
66
|
+
description: { type: "string", semantic: "schema:description" },
|
|
67
|
+
author: { type: "string", semantic: "schema:author" },
|
|
68
|
+
date_published: { type: "string", format: "date-time", semantic: "schema:datePublished" },
|
|
69
|
+
date_modified: { type: "string", format: "date-time", semantic: "schema:dateModified" },
|
|
70
|
+
image: { type: "string", format: "uri", semantic: "schema:image" },
|
|
71
|
+
url: { type: "string", format: "uri", semantic: "schema:url" },
|
|
72
|
+
category: { type: "string", semantic: "schema:articleSection" },
|
|
73
|
+
word_count: { type: "integer", semantic: "schema:wordCount" },
|
|
74
|
+
},
|
|
75
|
+
search_hints: {
|
|
76
|
+
filterable_by: ["category", "author"],
|
|
77
|
+
sortable_by: ["date_published", "title"],
|
|
78
|
+
text_searchable: ["title", "description"],
|
|
79
|
+
},
|
|
80
|
+
},
|
|
81
|
+
NewsArticle: {
|
|
82
|
+
resource_type: "content",
|
|
83
|
+
schema_name: "article",
|
|
84
|
+
properties: {
|
|
85
|
+
title: { type: "string", required: true, semantic: "schema:headline" },
|
|
86
|
+
description: { type: "string", semantic: "schema:description" },
|
|
87
|
+
author: { type: "string", semantic: "schema:author" },
|
|
88
|
+
date_published: { type: "string", format: "date-time", semantic: "schema:datePublished" },
|
|
89
|
+
image: { type: "string", format: "uri", semantic: "schema:image" },
|
|
90
|
+
url: { type: "string", format: "uri", semantic: "schema:url" },
|
|
91
|
+
section: { type: "string", semantic: "schema:articleSection" },
|
|
92
|
+
},
|
|
93
|
+
search_hints: {
|
|
94
|
+
filterable_by: ["section", "author"],
|
|
95
|
+
sortable_by: ["date_published"],
|
|
96
|
+
text_searchable: ["title", "description"],
|
|
97
|
+
},
|
|
98
|
+
},
|
|
99
|
+
BlogPosting: {
|
|
100
|
+
resource_type: "content",
|
|
101
|
+
schema_name: "article",
|
|
102
|
+
properties: {
|
|
103
|
+
title: { type: "string", required: true, semantic: "schema:headline" },
|
|
104
|
+
description: { type: "string", semantic: "schema:description" },
|
|
105
|
+
author: { type: "string", semantic: "schema:author" },
|
|
106
|
+
date_published: { type: "string", format: "date-time", semantic: "schema:datePublished" },
|
|
107
|
+
image: { type: "string", format: "uri", semantic: "schema:image" },
|
|
108
|
+
url: { type: "string", format: "uri", semantic: "schema:url" },
|
|
109
|
+
category: { type: "string", semantic: "schema:articleSection" },
|
|
110
|
+
},
|
|
111
|
+
search_hints: {
|
|
112
|
+
filterable_by: ["category", "author"],
|
|
113
|
+
sortable_by: ["date_published", "title"],
|
|
114
|
+
text_searchable: ["title", "description"],
|
|
115
|
+
},
|
|
116
|
+
},
|
|
117
|
+
Recipe: {
|
|
118
|
+
resource_type: "catalog",
|
|
119
|
+
schema_name: "recipe",
|
|
120
|
+
properties: {
|
|
121
|
+
name: { type: "string", required: true, semantic: "schema:name" },
|
|
122
|
+
description: { type: "string", semantic: "schema:description" },
|
|
123
|
+
image: { type: "string", format: "uri", semantic: "schema:image" },
|
|
124
|
+
author: { type: "string", semantic: "schema:author" },
|
|
125
|
+
prep_time: { type: "string", semantic: "schema:prepTime" },
|
|
126
|
+
cook_time: { type: "string", semantic: "schema:cookTime" },
|
|
127
|
+
total_time: { type: "string", semantic: "schema:totalTime" },
|
|
128
|
+
category: { type: "string", semantic: "schema:recipeCategory" },
|
|
129
|
+
cuisine: { type: "string", semantic: "schema:recipeCuisine" },
|
|
130
|
+
servings: { type: "string", semantic: "schema:recipeYield" },
|
|
131
|
+
calories: { type: "number", semantic: "schema:calories" },
|
|
132
|
+
rating: { type: "number", semantic: "schema:aggregateRating" },
|
|
133
|
+
ingredients: { type: "array", items: { type: "string" }, semantic: "schema:recipeIngredient" },
|
|
134
|
+
},
|
|
135
|
+
search_hints: {
|
|
136
|
+
filterable_by: ["category", "cuisine"],
|
|
137
|
+
sortable_by: ["rating", "total_time", "name"],
|
|
138
|
+
text_searchable: ["name", "description", "ingredients"],
|
|
139
|
+
},
|
|
140
|
+
},
|
|
141
|
+
Event: {
|
|
142
|
+
resource_type: "catalog",
|
|
143
|
+
schema_name: "event",
|
|
144
|
+
properties: {
|
|
145
|
+
name: { type: "string", required: true, semantic: "schema:name" },
|
|
146
|
+
description: { type: "string", semantic: "schema:description" },
|
|
147
|
+
start_date: { type: "string", format: "date-time", required: true, semantic: "schema:startDate" },
|
|
148
|
+
end_date: { type: "string", format: "date-time", semantic: "schema:endDate" },
|
|
149
|
+
location: { type: "string", semantic: "schema:location" },
|
|
150
|
+
image: { type: "string", format: "uri", semantic: "schema:image" },
|
|
151
|
+
url: { type: "string", format: "uri", semantic: "schema:url" },
|
|
152
|
+
organizer: { type: "string", semantic: "schema:organizer" },
|
|
153
|
+
price: { type: "number", semantic: "schema:offers.price" },
|
|
154
|
+
availability: { type: "string", semantic: "schema:offers.availability" },
|
|
155
|
+
},
|
|
156
|
+
search_hints: {
|
|
157
|
+
filterable_by: ["location", "organizer"],
|
|
158
|
+
sortable_by: ["start_date", "price", "name"],
|
|
159
|
+
text_searchable: ["name", "description"],
|
|
160
|
+
},
|
|
161
|
+
},
|
|
162
|
+
Restaurant: {
|
|
163
|
+
resource_type: "catalog",
|
|
164
|
+
schema_name: "restaurant",
|
|
165
|
+
properties: {
|
|
166
|
+
name: { type: "string", required: true, semantic: "schema:name" },
|
|
167
|
+
description: { type: "string", semantic: "schema:description" },
|
|
168
|
+
cuisine: { type: "string", semantic: "schema:servesCuisine" },
|
|
169
|
+
address: { type: "string", semantic: "schema:address" },
|
|
170
|
+
phone: { type: "string", semantic: "schema:telephone" },
|
|
171
|
+
price_range: { type: "string", semantic: "schema:priceRange" },
|
|
172
|
+
rating: { type: "number", semantic: "schema:aggregateRating" },
|
|
173
|
+
hours: { type: "string", semantic: "schema:openingHours" },
|
|
174
|
+
image: { type: "string", format: "uri", semantic: "schema:image" },
|
|
175
|
+
url: { type: "string", format: "uri", semantic: "schema:url" },
|
|
176
|
+
},
|
|
177
|
+
search_hints: {
|
|
178
|
+
filterable_by: ["cuisine", "price_range"],
|
|
179
|
+
sortable_by: ["rating", "name"],
|
|
180
|
+
text_searchable: ["name", "description", "cuisine"],
|
|
181
|
+
},
|
|
182
|
+
},
|
|
183
|
+
LocalBusiness: {
|
|
184
|
+
resource_type: "catalog",
|
|
185
|
+
schema_name: "business",
|
|
186
|
+
properties: {
|
|
187
|
+
name: { type: "string", required: true, semantic: "schema:name" },
|
|
188
|
+
description: { type: "string", semantic: "schema:description" },
|
|
189
|
+
address: { type: "string", semantic: "schema:address" },
|
|
190
|
+
phone: { type: "string", semantic: "schema:telephone" },
|
|
191
|
+
email: { type: "string", format: "email", semantic: "schema:email" },
|
|
192
|
+
hours: { type: "string", semantic: "schema:openingHours" },
|
|
193
|
+
rating: { type: "number", semantic: "schema:aggregateRating" },
|
|
194
|
+
image: { type: "string", format: "uri", semantic: "schema:image" },
|
|
195
|
+
url: { type: "string", format: "uri", semantic: "schema:url" },
|
|
196
|
+
},
|
|
197
|
+
search_hints: {
|
|
198
|
+
filterable_by: ["address"],
|
|
199
|
+
sortable_by: ["rating", "name"],
|
|
200
|
+
text_searchable: ["name", "description"],
|
|
201
|
+
},
|
|
202
|
+
},
|
|
203
|
+
Course: {
|
|
204
|
+
resource_type: "catalog",
|
|
205
|
+
schema_name: "course",
|
|
206
|
+
properties: {
|
|
207
|
+
name: { type: "string", required: true, semantic: "schema:name" },
|
|
208
|
+
description: { type: "string", semantic: "schema:description" },
|
|
209
|
+
provider: { type: "string", semantic: "schema:provider" },
|
|
210
|
+
url: { type: "string", format: "uri", semantic: "schema:url" },
|
|
211
|
+
language: { type: "string", semantic: "schema:inLanguage" },
|
|
212
|
+
price: { type: "number", semantic: "schema:offers.price" },
|
|
213
|
+
duration: { type: "string", semantic: "schema:timeRequired" },
|
|
214
|
+
},
|
|
215
|
+
search_hints: {
|
|
216
|
+
filterable_by: ["provider", "language"],
|
|
217
|
+
sortable_by: ["price", "name"],
|
|
218
|
+
text_searchable: ["name", "description"],
|
|
219
|
+
},
|
|
220
|
+
},
|
|
221
|
+
};
|
|
222
|
+
|
|
223
|
+
// ── HTTP Fetcher ──────────────────────────────────────────────────────────
|
|
27
224
|
|
|
28
225
|
function fetchUrl(url, maxRedirects = 5) {
|
|
29
226
|
return new Promise((resolve, reject) => {
|
|
@@ -31,7 +228,7 @@ function fetchUrl(url, maxRedirects = 5) {
|
|
|
31
228
|
|
|
32
229
|
const client = url.startsWith("https") ? https : http;
|
|
33
230
|
client
|
|
34
|
-
.get(url, { headers: { "User-Agent":
|
|
231
|
+
.get(url, { headers: { "User-Agent": USER_AGENT }, timeout: 10000 }, (res) => {
|
|
35
232
|
if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
|
|
36
233
|
const redirectUrl = res.headers.location.startsWith("http")
|
|
37
234
|
? res.headers.location
|
|
@@ -42,10 +239,13 @@ function fetchUrl(url, maxRedirects = 5) {
|
|
|
42
239
|
res.on("data", (chunk) => (data += chunk));
|
|
43
240
|
res.on("end", () => resolve({ status: res.statusCode, body: data, url }));
|
|
44
241
|
})
|
|
45
|
-
.on("error", reject)
|
|
242
|
+
.on("error", reject)
|
|
243
|
+
.on("timeout", () => reject(new Error("Request timed out")));
|
|
46
244
|
});
|
|
47
245
|
}
|
|
48
246
|
|
|
247
|
+
// ── HTML Extractors ───────────────────────────────────────────────────────
|
|
248
|
+
|
|
49
249
|
function extractMeta(html, name) {
|
|
50
250
|
const patterns = [
|
|
51
251
|
new RegExp(`<meta\\s+name=["']${name}["']\\s+content=["']([^"']*)["']`, "i"),
|
|
@@ -53,7 +253,6 @@ function extractMeta(html, name) {
|
|
|
53
253
|
new RegExp(`<meta\\s+property=["']${name}["']\\s+content=["']([^"']*)["']`, "i"),
|
|
54
254
|
new RegExp(`<meta\\s+content=["']([^"']*)["']\\s+property=["']${name}["']`, "i"),
|
|
55
255
|
];
|
|
56
|
-
|
|
57
256
|
for (const pattern of patterns) {
|
|
58
257
|
const match = html.match(pattern);
|
|
59
258
|
if (match) return match[1];
|
|
@@ -70,40 +269,89 @@ function extractJsonLd(html) {
|
|
|
70
269
|
const results = [];
|
|
71
270
|
const regex = /<script\s+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
|
|
72
271
|
let match;
|
|
73
|
-
|
|
74
272
|
while ((match = regex.exec(html)) !== null) {
|
|
75
273
|
try {
|
|
76
|
-
|
|
274
|
+
const parsed = JSON.parse(match[1]);
|
|
275
|
+
// Handle @graph arrays
|
|
276
|
+
if (parsed["@graph"] && Array.isArray(parsed["@graph"])) {
|
|
277
|
+
results.push(...parsed["@graph"]);
|
|
278
|
+
} else {
|
|
279
|
+
results.push(parsed);
|
|
280
|
+
}
|
|
77
281
|
} catch {
|
|
78
282
|
// Invalid JSON-LD, skip
|
|
79
283
|
}
|
|
80
284
|
}
|
|
81
|
-
|
|
82
285
|
return results;
|
|
83
286
|
}
|
|
84
287
|
|
|
288
|
+
/**
 * Collect the Schema.org types declared through Microdata attributes.
 *
 * A type is reported when an `itemscope` attribute and an
 * `itemtype="http(s)://schema.org/<Type>"` attribute appear together without
 * a tag boundary (`<` or `>`) between them. Both attribute orders are
 * accepted, as are the `itemscope=""` / `itemscope="itemscope"` spellings and
 * unrelated attributes placed in between.
 *
 * @param {string} html - Raw HTML to scan.
 * @returns {string[]} Unique type names (e.g. "Product"), in order of first appearance.
 */
function extractMicrodata(html) {
  // Alternative 1: itemscope ... itemtype="…/Type"  (type captured in group 1)
  // Alternative 2: itemtype="…/Type" ... itemscope  (type captured in group 2)
  // `[^<>]*?` keeps both attributes inside the same tag.
  const scopeRegex =
    /\bitemscope\b[^<>]*?\bitemtype=["']https?:\/\/schema\.org\/(\w+)["']|\bitemtype=["']https?:\/\/schema\.org\/(\w+)["'][^<>]*?\bitemscope\b/gi;
  const types = new Set();
  let match;
  while ((match = scopeRegex.exec(html)) !== null) {
    types.add(match[1] || match[2]);
  }
  return [...types];
}
|
|
297
|
+
|
|
85
298
|
/**
 * Read the document language from the `lang` attribute of the `<html>` tag.
 *
 * Quoted (`lang="en"`, `lang='en'`) and unquoted (`lang=en`) attribute values
 * are both accepted.
 *
 * @param {string} html - Raw HTML of the page.
 * @returns {string|null} The attribute value as written (e.g. "en", "fr-FR"),
 *   or null when the `<html>` tag carries no `lang` attribute.
 */
function detectLanguage(html) {
  // Group 1 holds a quoted value, group 2 an unquoted one. The `\s` before
  // `lang` prevents matching attributes such as `xml:lang`.
  const match = html.match(/<html[^>]*\slang=(?:["']([^"']*)["']|([^\s"'>]+))/i);
  if (!match) return null;
  return match[1] !== undefined ? match[1] : match[2];
}
|
|
89
302
|
|
|
303
|
+
/**
 * Collect same-host page links found in `href` attributes.
 *
 * Root-relative (`/path`) and absolute `http(s)://` hrefs are considered.
 * Query strings and `#fragments` are dropped, so each page is reported once as
 * `origin + pathname`. Links to static assets (stylesheets, scripts, images,
 * fonts, archives, ...) are skipped: they cannot carry structured data and
 * would otherwise use up the caller's crawl budget.
 *
 * @param {string} html - Raw HTML to scan.
 * @param {string} baseUrl - Absolute URL of the page the HTML came from.
 * @returns {string[]} Unique internal page URLs, in order of first appearance.
 * @throws {TypeError} If `baseUrl` is not a valid absolute URL.
 */
function extractInternalLinks(html, baseUrl) {
  const base = new URL(baseUrl);
  // File extensions that denote non-HTML resources.
  const assetRegex =
    /\.(?:css|js|mjs|map|json|xml|txt|ico|png|jpe?g|gif|svg|webp|avif|woff2?|ttf|otf|eot|pdf|zip|gz|mp3|mp4|webm)$/i;
  // The optional `#...` tail lets hrefs with a fragment match; the fragment
  // itself is not captured.
  const hrefRegex = /href=["'](\/[^"'#]*|https?:\/\/[^"'#]*)(?:#[^"']*)?["']/gi;
  const links = new Set();
  let match;
  while ((match = hrefRegex.exec(html)) !== null) {
    let linkUrl;
    try {
      // Resolves relative hrefs against the base and leaves absolute ones as-is.
      linkUrl = new URL(match[1], baseUrl);
    } catch {
      // Malformed href: ignore it and keep scanning.
      continue;
    }
    if (linkUrl.hostname !== base.hostname) continue;
    if (assetRegex.test(linkUrl.pathname)) continue;
    links.add(linkUrl.origin + linkUrl.pathname);
  }
  return [...links];
}
|
|
321
|
+
|
|
322
|
+
/**
 * Look for a reference to an OpenAPI/Swagger description in the page source.
 *
 * The HTML is searched for quoted strings that look like a spec location, in
 * priority order: `/api/[vN/]openapi|swagger.(json|yaml|yml)`, a root-level
 * `/openapi|swagger.(json|yaml|yml)`, `/api-docs`, then any absolute
 * `http(s)://…openapi|swagger.(json|yaml|yml)` URL.
 *
 * @param {string} html - Raw HTML to scan.
 * @param {string} baseUrl - Absolute URL used to resolve root-relative matches.
 * @returns {string|null} Absolute URL of the first match, or null if none is found.
 */
function detectOpenApiUrl(html, baseUrl) {
  const patterns = [
    /["'](\/api\/(?:v\d+\/)?(?:openapi|swagger)\.(?:json|ya?ml))["']/i,
    // Root-level specs, e.g. "/openapi.json" or "/swagger.yaml".
    /["'](\/(?:openapi|swagger)\.(?:json|ya?ml))["']/i,
    /["'](\/api-docs)["']/i,
    /["'](https?:\/\/[^"']*(?:openapi|swagger)\.(?:json|ya?ml))["']/i,
  ];
  for (const pattern of patterns) {
    const match = html.match(pattern);
    if (match) {
      // Absolute matches are returned verbatim; relative ones are resolved.
      return match[1].startsWith("http") ? match[1] : new URL(match[1], baseUrl).href;
    }
  }
  return null;
}
|
|
338
|
+
|
|
90
339
|
function inferSiteType(description, title, jsonLd) {
|
|
91
340
|
const text = `${title || ""} ${description || ""}`.toLowerCase();
|
|
92
341
|
|
|
93
|
-
// Check JSON-LD types first
|
|
94
342
|
for (const ld of jsonLd) {
|
|
95
343
|
const type = ld["@type"] || "";
|
|
96
344
|
if (typeof type === "string") {
|
|
97
345
|
if (type.includes("Store") || type.includes("Product")) return "ecommerce";
|
|
98
346
|
if (type.includes("Restaurant")) return "restaurant";
|
|
99
|
-
if (type.includes("Blog")) return "blog";
|
|
347
|
+
if (type.includes("Blog") || type.includes("BlogPosting")) return "blog";
|
|
100
348
|
if (type.includes("NewsArticle") || type.includes("NewsMediaOrganization")) return "news_media";
|
|
101
349
|
if (type.includes("SoftwareApplication")) return "saas";
|
|
102
350
|
if (type.includes("RealEstateAgent")) return "real_estate";
|
|
351
|
+
if (type.includes("Course")) return "education";
|
|
103
352
|
}
|
|
104
353
|
}
|
|
105
354
|
|
|
106
|
-
// Keyword-based inference
|
|
107
355
|
if (text.match(/shop|store|buy|cart|product|ecommerce/)) return "ecommerce";
|
|
108
356
|
if (text.match(/restaurant|menu|dine|reserv/)) return "restaurant";
|
|
109
357
|
if (text.match(/blog|article|post|writing/)) return "blog";
|
|
@@ -111,32 +359,148 @@ function inferSiteType(description, title, jsonLd) {
|
|
|
111
359
|
if (text.match(/saas|software|platform|app|tool|dashboard/)) return "saas";
|
|
112
360
|
if (text.match(/portfolio|freelanc|design|agency/)) return "portfolio";
|
|
113
361
|
if (text.match(/docs|documentation|api|reference/)) return "documentation";
|
|
362
|
+
if (text.match(/cours|learn|education|formation|training/)) return "education";
|
|
114
363
|
|
|
115
364
|
return "website";
|
|
116
365
|
}
|
|
117
366
|
|
|
118
|
-
// ──
|
|
367
|
+
// ── Schema Builder (Layer 2) ──────────────────────────────────────────────
|
|
368
|
+
|
|
369
|
+
/**
 * Build ARA schemas from parsed JSON-LD items.
 *
 * For each item whose `@type` (a string or an array of strings, with or
 * without the `http(s)://schema.org/` prefix) has an entry in
 * `JSONLD_TYPE_MAP`, one schema is produced under the mapping's `schema_name`.
 * The first item seen for a given schema name wins, so types that share a
 * schema name (such as the article variants) do not overwrite each other;
 * this matches the behavior of the Microdata builder.
 *
 * The base properties of the mapping are extended with scalar fields found on
 * the item itself. A field is skipped when the mapping already covers its
 * Schema.org term (e.g. `headline` when the base has a property with semantic
 * `schema:headline`) or when the normalized key already exists.
 *
 * @param {Array<*>} jsonLdItems - Parsed JSON-LD values; entries that are not
 *   objects are ignored.
 * @returns {Object<string, object>} Schemas keyed by schema name.
 */
function buildSchemasFromJsonLd(jsonLdItems) {
  const schemas = {};
  // Own-property lookup so names inherited from Object.prototype
  // ("constructor", "toString", ...) are never mistaken for known types.
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
  const stripPrefix = (rawType) =>
    typeof rawType === "string" ? rawType.replace(/^https?:\/\/schema\.org\//, "") : null;

  for (const item of jsonLdItems) {
    if (item === null || typeof item !== "object") continue;

    // `@type` may list several types; use the first one that has a mapping.
    const rawTypes = Array.isArray(item["@type"]) ? item["@type"] : [item["@type"]];
    const cleanType = rawTypes
      .map(stripPrefix)
      .find((candidate) => candidate && hasOwn(JSONLD_TYPE_MAP, candidate));
    if (!cleanType) continue;

    const mapping = JSONLD_TYPE_MAP[cleanType];
    const schemaName = mapping.schema_name;
    if (hasOwn(schemas, schemaName)) continue;

    // Start from the base mapping, then add scalar fields present in the data.
    const enrichedProperties = { ...mapping.properties };
    const coveredSemantics = new Set(
      Object.values(mapping.properties).map((property) => property.semantic)
    );

    for (const [key, value] of Object.entries(item)) {
      if (key.startsWith("@")) continue;
      // Nested objects, arrays and null cannot be described as a scalar property.
      if (value === null || typeof value === "object") continue;
      if (coveredSemantics.has(`schema:${key}`)) continue;

      // camelCase → snake_case
      const normalizedKey = key
        .replace(/([A-Z])/g, "_$1")
        .toLowerCase()
        .replace(/^_/, "");
      if (enrichedProperties[normalizedKey]) continue;

      let detectedType = "string";
      if (typeof value === "number") detectedType = "number";
      else if (typeof value === "boolean") detectedType = "boolean";

      enrichedProperties[normalizedKey] = {
        type: detectedType,
        semantic: `schema:${key}`,
        _auto_detected: true,
      };
    }

    schemas[schemaName] = {
      $ara_schema: "1.0",
      resource_id: `${schemaName}s`,
      label: `${cleanType}s`,
      description: `Schema for ${cleanType} resources — auto-generated from JSON-LD`,
      properties: enrichedProperties,
      search_hints: mapping.search_hints,
      _source: "json-ld",
      _source_type: cleanType,
      _confidence: "high",
      _note: "Auto-generated from JSON-LD structured data. Review and adjust properties as needed.",
    };
  }

  return schemas;
}
|
|
423
|
+
|
|
424
|
+
/**
 * Build ARA schemas from Schema.org type names detected via Microdata.
 *
 * Only the type name is known at this point, so each schema carries the
 * default properties of its `JSONLD_TYPE_MAP` entry. The first type seen for
 * a given schema name wins; unknown types are ignored.
 *
 * @param {string[]} microdataTypes - Schema.org type names (e.g. "Product").
 * @returns {Object<string, object>} Schemas keyed by schema name.
 */
function buildSchemasFromMicrodata(microdataTypes) {
  const schemas = {};

  for (const type of microdataTypes) {
    // Own-property check: names inherited from Object.prototype (such as
    // "constructor") must not be treated as known types.
    if (!Object.prototype.hasOwnProperty.call(JSONLD_TYPE_MAP, type)) continue;

    const mapping = JSONLD_TYPE_MAP[type];
    if (schemas[mapping.schema_name]) continue;

    schemas[mapping.schema_name] = {
      $ara_schema: "1.0",
      resource_id: `${mapping.schema_name}s`,
      label: `${type}s`,
      description: `Schema for ${type} resources — auto-generated from Microdata`,
      // Copied (as the JSON-LD builder does) so that editing a generated
      // schema cannot alter the shared type map.
      properties: { ...mapping.properties },
      search_hints: mapping.search_hints,
      _source: "microdata",
      _source_type: type,
      _confidence: "medium",
      _note:
        "Auto-generated from Microdata attributes. Properties are based on Schema.org " +
        "defaults — the actual data may have additional fields. Review and enrich manually.",
    };
  }

  return schemas;
}
|
|
450
|
+
|
|
451
|
+
// ── Sitemap Analyzer ──────────────────────────────────────────────────────
|
|
452
|
+
|
|
453
|
+
/**
 * Summarize a sitemap by grouping its URLs on their first path segment.
 *
 * Every `<loc>` entry is counted in `total_urls`. URLs whose path has at least
 * two segments (e.g. `/products/42`) are grouped under the first segment; each
 * group records how many URLs it holds and up to three example paths.
 * Entries that are not valid absolute URLs are counted but not grouped.
 *
 * @param {string} sitemapXml - Raw sitemap XML.
 * @returns {{ total_urls: number, patterns: Object<string, { count: number, examples: string[] }> }}
 */
function analyzeSitemap(sitemapXml) {
  const urls = [];
  const locRegex = /<loc>([^<]+)<\/loc>/g;
  let match;
  while ((match = locRegex.exec(sitemapXml)) !== null) {
    urls.push(match[1]);
  }

  // A Map keeps arbitrary segment names ("constructor", "__proto__", ...) from
  // colliding with members inherited from Object.prototype while counting.
  const groups = new Map();
  for (const url of urls) {
    let pathname;
    try {
      pathname = new URL(url).pathname;
    } catch {
      // One malformed <loc> must not abort the analysis of the whole sitemap.
      continue;
    }

    const segments = pathname.split("/").filter(Boolean);
    if (segments.length < 2) continue;

    const prefix = segments[0];
    let group = groups.get(prefix);
    if (!group) {
      group = { count: 0, examples: [] };
      groups.set(prefix, group);
    }
    group.count += 1;
    if (group.examples.length < 3) group.examples.push(pathname);
  }

  return { total_urls: urls.length, patterns: Object.fromEntries(groups) };
}
|
|
476
|
+
|
|
477
|
+
// ── Main Generator ────────────────────────────────────────────────────────
|
|
478
|
+
|
|
479
|
+
async function generate(siteUrl, options = {}) {
|
|
480
|
+
const { maxLayer = 2, crawlPages = 0 } = options;
|
|
121
481
|
const baseUrl = siteUrl.replace(/\/$/, "");
|
|
122
482
|
const parsedUrl = new URL(baseUrl);
|
|
123
483
|
const domain = parsedUrl.hostname;
|
|
124
484
|
|
|
125
|
-
console.error(
|
|
126
|
-
console.error(` Analyzing ${baseUrl}...\n`);
|
|
485
|
+
const log = (msg) => console.error(msg);
|
|
127
486
|
|
|
128
|
-
|
|
487
|
+
log(`\n ARA Generator v${VERSION}`);
|
|
488
|
+
log(` Target: ${baseUrl}`);
|
|
489
|
+
log(` Mode: Layer 1${maxLayer >= 2 ? " + Layer 2 (schemas)" : ""}`);
|
|
490
|
+
log(` ─────────────────────────────────────────\n`);
|
|
491
|
+
|
|
492
|
+
// ── Step 1: Fetch main page ─────────────────────────────────────────
|
|
129
493
|
let html = "";
|
|
130
494
|
try {
|
|
131
495
|
const response = await fetchUrl(baseUrl);
|
|
132
496
|
html = response.body;
|
|
133
|
-
|
|
497
|
+
log(` ✓ Fetched main page (${(html.length / 1024).toFixed(1)} KB)`);
|
|
134
498
|
} catch (e) {
|
|
135
|
-
|
|
499
|
+
log(` ✗ Could not fetch ${baseUrl}: ${e.message}`);
|
|
136
500
|
process.exit(1);
|
|
137
501
|
}
|
|
138
502
|
|
|
139
|
-
// Extract metadata
|
|
503
|
+
// ── Step 2: Extract all metadata ────────────────────────────────────
|
|
140
504
|
const title = extractMeta(html, "og:title") || extractTitle(html) || domain;
|
|
141
505
|
const description =
|
|
142
506
|
extractMeta(html, "og:description") ||
|
|
@@ -145,80 +509,225 @@ async function generate(siteUrl) {
|
|
|
145
509
|
const locale = detectLanguage(html);
|
|
146
510
|
const image = extractMeta(html, "og:image");
|
|
147
511
|
const jsonLd = extractJsonLd(html);
|
|
512
|
+
const microdataTypes = extractMicrodata(html);
|
|
513
|
+
const openApiUrl = detectOpenApiUrl(html, baseUrl);
|
|
148
514
|
const siteType = inferSiteType(description, title, jsonLd);
|
|
149
515
|
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
if (jsonLd.length > 0) {
|
|
153
|
-
|
|
516
|
+
log(` ✓ Extracted metadata: "${title}"`);
|
|
517
|
+
log(` ✓ Detected type: ${siteType}`);
|
|
518
|
+
if (jsonLd.length > 0) log(` ✓ Found ${jsonLd.length} JSON-LD block(s): ${jsonLd.map((ld) => ld["@type"]).filter(Boolean).join(", ")}`);
|
|
519
|
+
if (microdataTypes.length > 0) log(` ✓ Found Microdata: ${microdataTypes.join(", ")}`);
|
|
520
|
+
if (openApiUrl) log(` ✓ Detected OpenAPI: ${openApiUrl}`);
|
|
521
|
+
|
|
522
|
+
// ── Step 3: Crawl additional pages (optional) ───────────────────────
|
|
523
|
+
let additionalJsonLd = [];
|
|
524
|
+
let additionalMicrodata = [];
|
|
525
|
+
|
|
526
|
+
if (crawlPages > 0) {
|
|
527
|
+
const links = extractInternalLinks(html, baseUrl);
|
|
528
|
+
const toCrawl = links.slice(0, crawlPages);
|
|
529
|
+
log(`\n Crawling ${toCrawl.length} additional page(s)...`);
|
|
530
|
+
|
|
531
|
+
for (const link of toCrawl) {
|
|
532
|
+
try {
|
|
533
|
+
const resp = await fetchUrl(link);
|
|
534
|
+
const pageJsonLd = extractJsonLd(resp.body);
|
|
535
|
+
const pageMicrodata = extractMicrodata(resp.body);
|
|
536
|
+
additionalJsonLd.push(...pageJsonLd);
|
|
537
|
+
additionalMicrodata.push(...pageMicrodata);
|
|
538
|
+
log(` ✓ ${new URL(link).pathname} — ${pageJsonLd.length} JSON-LD, ${pageMicrodata.length} Microdata`);
|
|
539
|
+
} catch {
|
|
540
|
+
log(` — ${new URL(link).pathname} — skipped`);
|
|
541
|
+
}
|
|
542
|
+
}
|
|
154
543
|
}
|
|
155
544
|
|
|
156
|
-
|
|
545
|
+
const allJsonLd = [...jsonLd, ...additionalJsonLd];
|
|
546
|
+
const allMicrodata = [...new Set([...microdataTypes, ...additionalMicrodata])];
|
|
547
|
+
|
|
548
|
+
// ── Step 4: Check robots.txt & sitemap ──────────────────────────────
|
|
157
549
|
let hasRobots = false;
|
|
550
|
+
let hasSitemap = false;
|
|
551
|
+
let sitemapData = null;
|
|
552
|
+
|
|
158
553
|
try {
|
|
159
|
-
const
|
|
160
|
-
hasRobots =
|
|
161
|
-
|
|
554
|
+
const robotsResp = await fetchUrl(`${baseUrl}/robots.txt`);
|
|
555
|
+
hasRobots = robotsResp.status === 200 && robotsResp.body.length > 10;
|
|
556
|
+
log(`\n ${hasRobots ? "✓" : "—"} robots.txt`);
|
|
162
557
|
} catch {
|
|
163
|
-
|
|
558
|
+
log(" — robots.txt (unreachable)");
|
|
164
559
|
}
|
|
165
560
|
|
|
166
|
-
// Check sitemap
|
|
167
|
-
let hasSitemap = false;
|
|
168
561
|
try {
|
|
169
|
-
const
|
|
170
|
-
hasSitemap =
|
|
171
|
-
|
|
562
|
+
const sitemapResp = await fetchUrl(`${baseUrl}/sitemap.xml`);
|
|
563
|
+
hasSitemap = sitemapResp.status === 200 && sitemapResp.body.includes("<url");
|
|
564
|
+
if (hasSitemap) {
|
|
565
|
+
sitemapData = analyzeSitemap(sitemapResp.body);
|
|
566
|
+
log(` ✓ sitemap.xml (${sitemapData.total_urls} URLs, ${Object.keys(sitemapData.patterns).length} pattern(s))`);
|
|
567
|
+
} else {
|
|
568
|
+
log(" — sitemap.xml");
|
|
569
|
+
}
|
|
172
570
|
} catch {
|
|
173
|
-
|
|
571
|
+
log(" — sitemap.xml (unreachable)");
|
|
174
572
|
}
|
|
175
573
|
|
|
176
|
-
//
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
574
|
+
// ── Step 5: Check existing ARA ──────────────────────────────────────
|
|
575
|
+
let existingAra = false;
|
|
576
|
+
try {
|
|
577
|
+
const araResp = await fetchUrl(`${baseUrl}/.well-known/ara/manifest.json`);
|
|
578
|
+
existingAra = araResp.status === 200;
|
|
579
|
+
log(` ${existingAra ? "⚠ Site already has ARA manifest!" : "— No existing ARA manifest"}`);
|
|
580
|
+
} catch {
|
|
581
|
+
log(" — No existing ARA manifest");
|
|
582
|
+
}
|
|
180
583
|
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
description: description,
|
|
185
|
-
...(locale && { locale: [locale] }),
|
|
186
|
-
contact: {
|
|
187
|
-
website: baseUrl,
|
|
188
|
-
},
|
|
189
|
-
...(image && {
|
|
190
|
-
branding: {
|
|
191
|
-
logo: image,
|
|
192
|
-
},
|
|
193
|
-
}),
|
|
194
|
-
},
|
|
584
|
+
// ── Step 6: Build Layer 2 schemas ───────────────────────────────────
|
|
585
|
+
let schemas = {};
|
|
586
|
+
let schemaSource = "none";
|
|
195
587
|
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
588
|
+
if (maxLayer >= 2) {
|
|
589
|
+
log(`\n ── Layer 2: Schema Detection ──────────────────`);
|
|
590
|
+
|
|
591
|
+
// Priority 1: JSON-LD (highest confidence)
|
|
592
|
+
if (allJsonLd.length > 0) {
|
|
593
|
+
schemas = { ...schemas, ...buildSchemasFromJsonLd(allJsonLd) };
|
|
594
|
+
if (Object.keys(schemas).length > 0) {
|
|
595
|
+
schemaSource = "json-ld";
|
|
596
|
+
log(` ✓ Generated ${Object.keys(schemas).length} schema(s) from JSON-LD`);
|
|
597
|
+
}
|
|
598
|
+
}
|
|
599
|
+
|
|
600
|
+
// Priority 2: Microdata (medium confidence)
|
|
601
|
+
if (allMicrodata.length > 0) {
|
|
602
|
+
const microdataSchemas = buildSchemasFromMicrodata(allMicrodata);
|
|
603
|
+
for (const [name, schema] of Object.entries(microdataSchemas)) {
|
|
604
|
+
if (!schemas[name]) {
|
|
605
|
+
schemas[name] = schema;
|
|
606
|
+
log(` ✓ Generated schema "${name}" from Microdata`);
|
|
607
|
+
}
|
|
608
|
+
}
|
|
609
|
+
if (Object.keys(microdataSchemas).length > 0 && schemaSource === "none") {
|
|
610
|
+
schemaSource = "microdata";
|
|
611
|
+
}
|
|
612
|
+
}
|
|
613
|
+
|
|
614
|
+
// Priority 3: Sitemap patterns (low confidence — resource detection only)
|
|
615
|
+
if (sitemapData && Object.keys(sitemapData.patterns).length > 0) {
|
|
616
|
+
log(` ℹ Sitemap URL patterns detected:`);
|
|
617
|
+
for (const [prefix, data] of Object.entries(sitemapData.patterns)) {
|
|
618
|
+
if (data.count >= 3) {
|
|
619
|
+
log(` /${prefix}/ — ${data.count} pages (e.g. ${data.examples[0]})`);
|
|
620
|
+
}
|
|
621
|
+
}
|
|
622
|
+
}
|
|
623
|
+
|
|
624
|
+
if (Object.keys(schemas).length === 0) {
|
|
625
|
+
log(`\n ⚠ No structured data found for Layer 2 auto-generation.`);
|
|
626
|
+
log(` Layer 2 schemas require at least one of:`);
|
|
627
|
+
log(` - JSON-LD blocks (<script type="application/ld+json">)`);
|
|
628
|
+
log(` - Microdata attributes (itemscope, itemprop)`);
|
|
629
|
+
log(` - OpenAPI/Swagger endpoint`);
|
|
630
|
+
log(` → Generating Layer 1 only. Add schemas manually.`);
|
|
631
|
+
}
|
|
632
|
+
}
|
|
633
|
+
|
|
634
|
+
// ── Step 7: Build manifest ──────────────────────────────────────────
|
|
635
|
+
log(`\n ── Building Manifest ─────────────────────────`);
|
|
636
|
+
|
|
637
|
+
const resources = [];
|
|
638
|
+
|
|
639
|
+
// Add resources from detected schemas
|
|
640
|
+
for (const [name, schema] of Object.entries(schemas)) {
|
|
641
|
+
resources.push({
|
|
642
|
+
id: schema.resource_id,
|
|
643
|
+
type: JSONLD_TYPE_MAP[schema._source_type]?.resource_type || "content",
|
|
644
|
+
label: schema.label,
|
|
645
|
+
...(maxLayer >= 2 && { schema_ref: `schemas/${name}.json` }),
|
|
646
|
+
access: "public",
|
|
647
|
+
freshness: "weekly",
|
|
648
|
+
});
|
|
649
|
+
}
|
|
650
|
+
|
|
651
|
+
// Add resources from sitemap patterns not covered by schemas
|
|
652
|
+
if (sitemapData) {
|
|
653
|
+
const coveredIds = new Set(resources.map((r) => r.id));
|
|
654
|
+
for (const [prefix, data] of Object.entries(sitemapData.patterns)) {
|
|
655
|
+
if (data.count >= 3 && !coveredIds.has(prefix)) {
|
|
656
|
+
resources.push({
|
|
657
|
+
id: prefix,
|
|
201
658
|
type: "content",
|
|
202
|
-
label:
|
|
203
|
-
description:
|
|
659
|
+
label: prefix.charAt(0).toUpperCase() + prefix.slice(1),
|
|
660
|
+
description: `${data.count} pages detected under /${prefix}/`,
|
|
661
|
+
count: data.count,
|
|
204
662
|
access: "public",
|
|
205
663
|
freshness: "weekly",
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
664
|
+
_note: "Detected from sitemap URL patterns. Add schema_ref manually for Layer 2.",
|
|
665
|
+
});
|
|
666
|
+
}
|
|
667
|
+
}
|
|
668
|
+
}
|
|
669
|
+
|
|
670
|
+
// Fallback: at least one generic resource
|
|
671
|
+
if (resources.length === 0) {
|
|
672
|
+
resources.push({
|
|
673
|
+
id: "pages",
|
|
674
|
+
type: "content",
|
|
675
|
+
label: "Site Pages",
|
|
676
|
+
description: "Pages available on this website",
|
|
677
|
+
access: "public",
|
|
678
|
+
freshness: "weekly",
|
|
679
|
+
});
|
|
680
|
+
}
|
|
681
|
+
|
|
682
|
+
// Identity enrichment from JSON-LD Organization/LocalBusiness
|
|
683
|
+
const identity = {
|
|
684
|
+
name: title.replace(/\s*[-|–—].*$/, "").trim(),
|
|
685
|
+
type: siteType,
|
|
686
|
+
description: description,
|
|
687
|
+
...(locale && { locale: [locale] }),
|
|
688
|
+
contact: {
|
|
689
|
+
website: baseUrl,
|
|
209
690
|
},
|
|
691
|
+
...(image && { branding: { logo: image } }),
|
|
692
|
+
};
|
|
693
|
+
|
|
694
|
+
for (const ld of allJsonLd) {
|
|
695
|
+
const ldType = ld["@type"];
|
|
696
|
+
if (ldType === "Organization" || ldType === "LocalBusiness" || ldType === "Restaurant") {
|
|
697
|
+
if (ld.name) identity.name = ld.name;
|
|
698
|
+
if (ld.telephone) identity.contact.phone = ld.telephone;
|
|
699
|
+
if (ld.email) identity.contact.email = ld.email;
|
|
700
|
+
if (ld.address) {
|
|
701
|
+
const addr = typeof ld.address === "string" ? ld.address : ld.address.streetAddress || ld.address.name;
|
|
702
|
+
if (addr) identity.geo = { address: addr };
|
|
703
|
+
if (ld.address.addressLocality) {
|
|
704
|
+
identity.geo = identity.geo || {};
|
|
705
|
+
identity.geo.city = ld.address.addressLocality;
|
|
706
|
+
}
|
|
707
|
+
}
|
|
708
|
+
if (ld.geo && ld.geo.latitude) {
|
|
709
|
+
identity.geo = identity.geo || {};
|
|
710
|
+
identity.geo.coordinates = [parseFloat(ld.geo.latitude), parseFloat(ld.geo.longitude)];
|
|
711
|
+
}
|
|
712
|
+
}
|
|
713
|
+
}
|
|
210
714
|
|
|
715
|
+
const manifest = {
|
|
716
|
+
$ara: "1.0",
|
|
717
|
+
$schema: "https://ara-standard.org/schema/manifest/v1",
|
|
718
|
+
identity,
|
|
719
|
+
content_map: {
|
|
720
|
+
summary: `Content from ${domain}${resources.length > 1 ? ` — ${resources.length} resource types detected` : ""}`,
|
|
721
|
+
resources,
|
|
722
|
+
},
|
|
211
723
|
capabilities: {
|
|
212
|
-
protocols: {
|
|
213
|
-
|
|
724
|
+
protocols: {
|
|
725
|
+
...(openApiUrl && { rest_api: { openapi: openApiUrl } }),
|
|
726
|
+
},
|
|
214
727
|
},
|
|
215
|
-
|
|
216
728
|
policies: {
|
|
217
729
|
agent_access: "open",
|
|
218
|
-
rate_limit: {
|
|
219
|
-
requests_per_minute: 30,
|
|
220
|
-
burst: 5,
|
|
221
|
-
},
|
|
730
|
+
rate_limit: { requests_per_minute: 30, burst: 5 },
|
|
222
731
|
data_usage: {
|
|
223
732
|
caching_allowed: true,
|
|
224
733
|
cache_ttl: 3600,
|
|
@@ -226,78 +735,172 @@ async function generate(siteUrl) {
|
|
|
226
735
|
attribution_required: true,
|
|
227
736
|
},
|
|
228
737
|
},
|
|
229
|
-
|
|
230
738
|
meta: {
|
|
231
739
|
generated_at: new Date().toISOString(),
|
|
232
|
-
generator:
|
|
740
|
+
generator: `ara-generator/${VERSION}`,
|
|
741
|
+
generator_layer: Object.keys(schemas).length > 0 ? 2 : 1,
|
|
233
742
|
human_site: baseUrl,
|
|
743
|
+
...(Object.keys(schemas).length > 0 && {
|
|
744
|
+
schemas_generated: Object.keys(schemas).length,
|
|
745
|
+
schema_source: schemaSource,
|
|
746
|
+
}),
|
|
234
747
|
},
|
|
235
748
|
};
|
|
236
749
|
|
|
237
|
-
//
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
750
|
+
// ── Summary ─────────────────────────────────────────────────────────
|
|
751
|
+
const generatedLayer = Object.keys(schemas).length > 0 ? 2 : 1;
|
|
752
|
+
|
|
753
|
+
log(`\n ── Results ───────────────────────────────────`);
|
|
754
|
+
log(` ✓ Generated ARA manifest (Layer ${generatedLayer})`);
|
|
755
|
+
log(` Identity: ${identity.name} (${siteType})`);
|
|
756
|
+
log(` Resources: ${resources.length}`);
|
|
757
|
+
log(` Schemas: ${Object.keys(schemas).length}`);
|
|
758
|
+
if (openApiUrl) log(` OpenAPI: detected`);
|
|
759
|
+
log(``);
|
|
760
|
+
|
|
761
|
+
if (generatedLayer === 1 && maxLayer >= 2) {
|
|
762
|
+
log(` ── How to enable Layer 2 auto-generation ────`);
|
|
763
|
+
log(` Add structured data to your HTML:`);
|
|
764
|
+
log(``);
|
|
765
|
+
log(` Option A — JSON-LD (recommended):`);
|
|
766
|
+
log(` <script type="application/ld+json">`);
|
|
767
|
+
log(` { "@type": "Product", "name": "...", "price": "..." }`);
|
|
768
|
+
log(` </script>`);
|
|
769
|
+
log(``);
|
|
770
|
+
log(` Option B — Microdata:`);
|
|
771
|
+
log(` <div itemscope itemtype="https://schema.org/Product">`);
|
|
772
|
+
log(` <span itemprop="name">...</span>`);
|
|
773
|
+
log(` </div>`);
|
|
774
|
+
log(``);
|
|
775
|
+
log(` Then re-run: npx ara-generate ${baseUrl}`);
|
|
776
|
+
log(``);
|
|
249
777
|
}
|
|
250
778
|
|
|
251
|
-
|
|
252
|
-
|
|
779
|
+
if (generatedLayer >= 2) {
|
|
780
|
+
log(` ── Layer 2 Notes ────────────────────────────`);
|
|
781
|
+
log(` Schemas were auto-generated from ${schemaSource}.`);
|
|
782
|
+
log(` Confidence: ${schemaSource === "json-ld" ? "HIGH" : "MEDIUM"}`);
|
|
783
|
+
log(` Review generated schemas and:`);
|
|
784
|
+
log(` - Remove properties that don't apply to your site`);
|
|
785
|
+
log(` - Add custom properties specific to your business`);
|
|
786
|
+
log(` - Adjust search_hints (filterable_by, sortable_by)`);
|
|
787
|
+
log(` - Add Layer 3 actions.json manually if needed`);
|
|
788
|
+
log(``);
|
|
789
|
+
}
|
|
253
790
|
|
|
254
|
-
return manifest;
|
|
791
|
+
return { manifest, schemas };
|
|
255
792
|
}
|
|
256
793
|
|
|
257
|
-
// ── CLI
|
|
794
|
+
// ── CLI ───────────────────────────────────────────────────────────────────
|
|
258
795
|
|
|
259
796
|
/**
 * Returns the value that follows `flag` on the command line.
 *
 * @param {string[]} args - CLI arguments (node/script entries already removed).
 * @param {string} flag - Option name, e.g. "--output".
 * @returns {string|null} The option value, or null when the flag is absent.
 * @throws {Error} When the flag is present but nothing usable follows it.
 */
function readCliOptionValue(args, flag) {
  const flagIndex = args.indexOf(flag);
  if (flagIndex === -1) return null;

  const value = args[flagIndex + 1];
  // A trailing flag, or a flag directly followed by another long option, has no value.
  if (value === undefined || value.startsWith("--")) {
    throw new Error(`Missing value for ${flag}`);
  }
  return value;
}

/**
 * Reads an integer-valued CLI option.
 *
 * @param {string[]} args - CLI arguments (node/script entries already removed).
 * @param {string} flag - Option name, e.g. "--crawl".
 * @param {number} fallback - Value used when the flag is absent.
 * @returns {number} The parsed integer, or `fallback`.
 * @throws {Error} When the value is missing or does not parse as an integer.
 */
function readCliIntegerOption(args, flag, fallback) {
  const raw = readCliOptionValue(args, flag);
  if (raw === null) return fallback;

  const parsed = Number.parseInt(raw, 10);
  // NaN compares false against every threshold, so letting it through would
  // silently disable the feature the user asked for.
  if (Number.isNaN(parsed)) {
    throw new Error(`Invalid value for ${flag}: "${raw}" (expected an integer)`);
  }
  return parsed;
}

/**
 * Builds a copy of a generated schema without the generator's internal
 * bookkeeping fields (`_source`, `_source_type`, `_confidence`, and the
 * per-property `_auto_detected` marker).
 *
 * The input schema and its property objects are left untouched; key order of
 * the remaining fields is preserved so serialized output stays stable.
 *
 * @param {object} schema - Schema object as returned by `generate`.
 * @param {{ keepNote?: boolean }} [options] - `keepNote: true` retains `_note`.
 * @returns {object} A cleaned copy that is safe to serialize.
 */
function toPublishableSchema(schema, { keepNote = false } = {}) {
  const clean = { ...schema };
  delete clean._source;
  delete clean._source_type;
  delete clean._confidence;
  if (!keepNote) delete clean._note;

  if (clean.properties) {
    // Copy each property object before stripping the marker: the spread above
    // is shallow, so deleting in place would mutate the caller's schema.
    clean.properties = Object.fromEntries(
      Object.entries(clean.properties).map(([key, prop]) => {
        const publicProp = { ...prop };
        delete publicProp._auto_detected;
        return [key, publicProp];
      }),
    );
  }
  return clean;
}

/**
 * CLI entry point.
 *
 * Parses `process.argv`, runs `generate` for the given URL and then either
 * writes `manifest.json` plus `schemas/<name>.json` into the `--output`
 * directory, or prints the manifest (with schemas embedded under
 * `_generated_schemas`) to stdout.
 *
 * Prints usage and exits with status 0 when no URL is given or when
 * `--help` / `-h` is present.
 *
 * @returns {Promise<void>}
 * @throws {Error} When `--output`, `--layer` or `--crawl` has a missing or
 *   invalid value, or when generation / file writing fails.
 */
async function main() {
  const args = process.argv.slice(2);
  const url = args.find((a) => a.startsWith("http"));

  // Help is handled before option validation so that `--help` always works,
  // even when combined with an otherwise incomplete command line.
  if (!url || args.includes("--help") || args.includes("-h")) {
    console.log(`
ARA Generator v${VERSION}
${"=".repeat(20 + VERSION.length)}

Generates ARA manifest + schemas from site metadata.
Zero dependencies. Uses JSON-LD, Microdata, OpenGraph, and sitemap analysis.

Usage:
npx ara-generate <url>
npx ara-generate <url> --output .well-known/ara/
npx ara-generate <url> --layer 1 (manifest only, skip schemas)
npx ara-generate <url> --layer 2 (manifest + schemas, default)
npx ara-generate <url> --crawl 5 (crawl up to 5 internal pages)

Options:
--output <dir> Write files to directory (creates manifest.json + schemas/)
--layer <n> Max generation layer: 1 = manifest only, 2 = + schemas
--crawl <n> Crawl N internal pages to find more structured data

Layer 2 schema auto-generation works when the site has:
✓ JSON-LD structured data (<script type="application/ld+json">) → HIGH confidence
✓ Microdata attributes (itemscope, itemprop) → MEDIUM confidence
✓ OpenAPI/Swagger endpoint (auto-detected) → protocol detection

Layer 2 CANNOT be auto-generated when:
✗ Site has no structured data (plain HTML only)
✗ Content is rendered client-side only (SPA without SSR)
✗ Data is behind authentication

Supported Schema.org types:
Product, Article, NewsArticle, BlogPosting, Recipe, Event,
Restaurant, LocalBusiness, Course

Examples:
npx ara-generate https://myshop.com
npx ara-generate https://myblog.com --output .well-known/ara/
npx ara-generate https://mysite.com --crawl 10 --output .well-known/ara/
`);
    process.exit(0);
  }

  const outputDir = readCliOptionValue(args, "--output");
  const maxLayer = readCliIntegerOption(args, "--layer", 2);
  const crawlPages = readCliIntegerOption(args, "--crawl", 0);

  const { manifest, schemas } = await generate(url, { maxLayer, crawlPages });
  const schemaEntries = Object.entries(schemas);

  if (outputDir) {
    // Write manifest + schemas to directory.
    // Drop one trailing slash for tidy log lines; fall back to the raw value
    // so that "/" does not collapse into an empty path.
    const dir = outputDir.replace(/\/$/, "") || outputDir;
    // `recursive: true` creates missing parents and does not fail when the
    // directory already exists, so no separate existence check is needed.
    fs.mkdirSync(dir, { recursive: true });

    fs.writeFileSync(path.join(dir, "manifest.json"), JSON.stringify(manifest, null, 2));
    console.error(` ✓ Saved ${dir}/manifest.json`);

    if (schemaEntries.length > 0) {
      const schemasDir = path.join(dir, "schemas");
      fs.mkdirSync(schemasDir, { recursive: true });

      for (const [name, schema] of schemaEntries) {
        // Clean internal fields before writing
        const cleanSchema = toPublishableSchema(schema);

        // Add note as a proper field; skip an absent note instead of
        // serializing the text "undefined".
        cleanSchema._generator_note = [
          schema._note,
          `Source: ${schema._source} (${schema._confidence} confidence).`,
        ]
          .filter(Boolean)
          .join(" ");

        fs.writeFileSync(path.join(schemasDir, `${name}.json`), JSON.stringify(cleanSchema, null, 2));
        console.error(` ✓ Saved ${dir}/schemas/${name}.json`);
      }
    }
    return;
  }

  // Output manifest to stdout (schemas embedded under `_generated_schemas`).
  // The stdout form keeps each schema's `_note` field as-is.
  if (schemaEntries.length > 0) {
    manifest._generated_schemas = {};
    for (const [name, schema] of schemaEntries) {
      manifest._generated_schemas[name] = toPublishableSchema(schema, { keepNote: true });
    }
  }
  console.log(JSON.stringify(manifest, null, 2));
}
|
|
298
901
|
|
|
299
902
|
// Script entry: run the CLI; on any rejection print the failure message to
// stderr and terminate with a non-zero exit status.
main().catch((failure) => {
  const report = `\n ✗ Error: ${failure.message}\n`;
  console.error(report);
  process.exit(1);
});
|
|
303
906
|
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "ara-generate",
|
|
3
|
-
"version": "
|
|
4
|
-
"description": "Generates ARA manifests from existing website metadata",
|
|
3
|
+
"version": "2.0.0",
|
|
4
|
+
"description": "Generates ARA manifests and schemas from existing website metadata (JSON-LD, Microdata, OpenGraph, sitemap)",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"bin": {
|
|
7
7
|
"ara-generate": "./index.js"
|
|
@@ -13,7 +13,11 @@
|
|
|
13
13
|
"ai-agents",
|
|
14
14
|
"manifest",
|
|
15
15
|
"generator",
|
|
16
|
-
"
|
|
16
|
+
"schema",
|
|
17
|
+
"json-ld",
|
|
18
|
+
"microdata",
|
|
19
|
+
"mcp",
|
|
20
|
+
"a2a"
|
|
17
21
|
],
|
|
18
22
|
"author": "ARA Standard Contributors",
|
|
19
23
|
"license": "MIT",
|
|
@@ -24,5 +28,9 @@
|
|
|
24
28
|
"homepage": "https://ara-standard.org",
|
|
25
29
|
"engines": {
|
|
26
30
|
"node": ">=16.0.0"
|
|
27
|
-
}
|
|
31
|
+
},
|
|
32
|
+
"files": [
|
|
33
|
+
"index.js",
|
|
34
|
+
"README.md"
|
|
35
|
+
]
|
|
28
36
|
}
|