@braedenbuilds/crawl-sim 1.2.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +1 -1
- package/.claude-plugin/plugin.json +1 -1
- package/package.json +1 -1
- package/skills/crawl-sim/SKILL.md +24 -3
- package/skills/crawl-sim/scripts/_lib.sh +6 -1
- package/skills/crawl-sim/scripts/build-report.sh +45 -0
- package/skills/crawl-sim/scripts/check-llmstxt.sh +5 -0
- package/skills/crawl-sim/scripts/check-sitemap.sh +10 -1
- package/skills/crawl-sim/scripts/compute-score.sh +59 -37
- package/skills/crawl-sim/scripts/extract-links.sh +5 -7
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@braedenbuilds/crawl-sim",
|
|
3
|
-
"version": "1.2.0",
|
|
3
|
+
"version": "1.3.1",
|
|
4
4
|
"description": "Agent-native multi-bot web crawler simulator. See your site through the eyes of Googlebot, GPTBot, ClaudeBot, and PerplexityBot.",
|
|
5
5
|
"bin": {
|
|
6
6
|
"crawl-sim": "bin/install.js"
|
|
package/skills/crawl-sim/SKILL.md
CHANGED
|
@@ -51,7 +51,7 @@ Split the work into **five Bash invocations**, each with a clear `description` f
|
|
|
51
51
|
|
|
52
52
|
### Stage 1 — Fetch
|
|
53
53
|
|
|
54
|
-
Tell the user: "Fetching as Googlebot, GPTBot, ClaudeBot, and PerplexityBot..."
|
|
54
|
+
Tell the user: "Fetching as Googlebot, GPTBot, ClaudeBot, and PerplexityBot in parallel..."
|
|
55
55
|
|
|
56
56
|
```bash
|
|
57
57
|
# Resolve skill directory
|
|
@@ -67,7 +67,16 @@ fi
|
|
|
67
67
|
RUN_DIR=$(mktemp -d -t crawl-sim.XXXXXX)
|
|
68
68
|
URL="<user-provided-url>"
|
|
69
69
|
for bot in googlebot gptbot claudebot perplexitybot; do
|
|
70
|
-
"$SKILL_DIR/scripts/fetch-as-bot.sh" "$URL" "$SKILL_DIR/profiles/${bot}.json" > "$RUN_DIR/fetch-${bot}.json"
|
|
70
|
+
"$SKILL_DIR/scripts/fetch-as-bot.sh" "$URL" "$SKILL_DIR/profiles/${bot}.json" > "$RUN_DIR/fetch-${bot}.json" &
|
|
71
|
+
done
|
|
72
|
+
wait
|
|
73
|
+
|
|
74
|
+
# Verify no empty fetch files (guard against silent parallel failures)
|
|
75
|
+
for bot in googlebot gptbot claudebot perplexitybot; do
|
|
76
|
+
if [ ! -s "$RUN_DIR/fetch-${bot}.json" ]; then
|
|
77
|
+
echo "WARNING: fetch-${bot}.json is empty — retrying serially" >&2
|
|
78
|
+
"$SKILL_DIR/scripts/fetch-as-bot.sh" "$URL" "$SKILL_DIR/profiles/${bot}.json" > "$RUN_DIR/fetch-${bot}.json"
|
|
79
|
+
fi
|
|
71
80
|
done
|
|
72
81
|
```
|
|
73
82
|
|
|
@@ -124,7 +133,7 @@ Tell the user: "Computing per-bot scores and finalizing the report..."
|
|
|
124
133
|
|
|
125
134
|
```bash
|
|
126
135
|
"$SKILL_DIR/scripts/compute-score.sh" "$RUN_DIR" > "$RUN_DIR/score.json"
|
|
127
|
-
|
|
136
|
+
"$SKILL_DIR/scripts/build-report.sh" "$RUN_DIR" > ./crawl-sim-report.json
|
|
128
137
|
```
|
|
129
138
|
|
|
130
139
|
**Page-type awareness.** `compute-score.sh` derives a page type from the target URL (`root` / `detail` / `archive` / `faq` / `about` / `contact` / `generic`) and picks a schema rubric accordingly. Root pages are expected to ship `Organization` + `WebSite` — penalizing them for missing `BreadcrumbList` or `FAQPage` would be wrong, so the scorer doesn't. If the URL heuristic picks the wrong type (e.g., a homepage at `/en/` that URL-parses as generic), pass `--page-type <type>`:
|
|
@@ -162,6 +171,18 @@ Print a boxed score card to the terminal:
|
|
|
162
171
|
|
|
163
172
|
Progress bars are 20 chars wide using `█` and `░` (each char = 5%).
|
|
164
173
|
|
|
174
|
+
**Parity-aware display.** When `parity.score >= 95` AND all per-bot composite scores are within 5 points of each other, collapse the four bot rows into one:
|
|
175
|
+
|
|
176
|
+
```
|
|
177
|
+
║ All 4 bots 98 A ███████████████████░ (parity: content identical) ║
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
Only show individual bot rows when scores diverge — that's when per-bot detail adds information. Always show the parity line in the category breakdown:
|
|
181
|
+
|
|
182
|
+
```
|
|
183
|
+
║ Content Parity 100 A (all bots see the same content) ║
|
|
184
|
+
```
|
|
185
|
+
|
|
165
186
|
## Output Layer 2 — Narrative Audit
|
|
166
187
|
|
|
167
188
|
Lead with a **Bot differentiation summary** — state up front whether the bots scored the same or differently, and why. If they scored the same, explicitly say so:
|
|
package/skills/crawl-sim/scripts/_lib.sh
CHANGED
|
@@ -73,9 +73,14 @@ page_type_for_url() {
|
|
|
73
73
|
|
|
74
74
|
# Fetch a URL to a local file and return the HTTP status code on stdout.
|
|
75
75
|
# Usage: status=$(fetch_to_file <url> <output-file> [timeout-seconds])
|
|
76
|
+
# Retries once on transient failure (same SSL/DNS flake that caused #11).
|
|
76
77
|
fetch_to_file() {
|
|
77
78
|
local url="$1"
|
|
78
79
|
local out="$2"
|
|
79
80
|
local timeout="${3:-15}"
|
|
80
|
-
|
|
81
|
+
local status
|
|
82
|
+
status=$(curl -sS -L -o "$out" -w '%{http_code}' --max-time "$timeout" "$url" 2>/dev/null) && echo "$status" && return
|
|
83
|
+
# Retry once on transient failure
|
|
84
|
+
status=$(curl -sS -L -o "$out" -w '%{http_code}' --max-time "$timeout" "$url" 2>/dev/null) && echo "$status" && return
|
|
85
|
+
echo "000"
|
|
81
86
|
}
|
|
package/skills/crawl-sim/scripts/build-report.sh
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
set -eu
|
|
3
|
+
|
|
4
|
+
# build-report.sh — Consolidate all crawl-sim outputs into a single JSON report
|
|
5
|
+
# Usage: build-report.sh <results-dir>
|
|
6
|
+
# Output: JSON to stdout
|
|
7
|
+
|
|
8
|
+
RESULTS_DIR="${1:?Usage: build-report.sh <results-dir>}"
|
|
9
|
+
|
|
10
|
+
if [ ! -f "$RESULTS_DIR/score.json" ]; then
|
|
11
|
+
echo "Error: score.json not found in $RESULTS_DIR — run compute-score.sh first" >&2
|
|
12
|
+
exit 1
|
|
13
|
+
fi
|
|
14
|
+
|
|
15
|
+
SCORE=$(cat "$RESULTS_DIR/score.json")
|
|
16
|
+
|
|
17
|
+
# Collect per-bot raw data
|
|
18
|
+
PER_BOT="{}"
|
|
19
|
+
for f in "$RESULTS_DIR"/fetch-*.json; do
|
|
20
|
+
[ -f "$f" ] || continue
|
|
21
|
+
bot_id=$(basename "$f" .json | sed 's/^fetch-//')
|
|
22
|
+
|
|
23
|
+
BOT_RAW=$(jq -n \
|
|
24
|
+
--argjson fetch "$(jq '{status, timing, size, wordCount, redirectCount, finalUrl, redirectChain, fetchFailed, error}' "$f" 2>/dev/null || echo '{}')" \
|
|
25
|
+
--argjson meta "$(jq '.' "$RESULTS_DIR/meta-$bot_id.json" 2>/dev/null || echo '{}')" \
|
|
26
|
+
--argjson jsonld "$(jq '{blockCount, types, blocks}' "$RESULTS_DIR/jsonld-$bot_id.json" 2>/dev/null || echo '{}')" \
|
|
27
|
+
--argjson links "$(jq '.' "$RESULTS_DIR/links-$bot_id.json" 2>/dev/null || echo '{}')" \
|
|
28
|
+
--argjson robots "$(jq '.' "$RESULTS_DIR/robots-$bot_id.json" 2>/dev/null || echo '{}')" \
|
|
29
|
+
'{fetch: $fetch, meta: $meta, jsonld: $jsonld, links: $links, robots: $robots}')
|
|
30
|
+
|
|
31
|
+
PER_BOT=$(printf '%s' "$PER_BOT" | jq --argjson raw "$BOT_RAW" --arg id "$bot_id" '.[$id] = $raw')
|
|
32
|
+
done
|
|
33
|
+
|
|
34
|
+
# Collect independent (non-per-bot) data
|
|
35
|
+
INDEPENDENT=$(jq -n \
|
|
36
|
+
--argjson sitemap "$(jq '.' "$RESULTS_DIR/sitemap.json" 2>/dev/null || echo '{}')" \
|
|
37
|
+
--argjson llmstxt "$(jq '.' "$RESULTS_DIR/llmstxt.json" 2>/dev/null || echo '{}')" \
|
|
38
|
+
--argjson diffRender "$(jq '.' "$RESULTS_DIR/diff-render.json" 2>/dev/null || echo '{"skipped":true,"reason":"not_found"}')" \
|
|
39
|
+
'{sitemap: $sitemap, llmstxt: $llmstxt, diffRender: $diffRender}')
|
|
40
|
+
|
|
41
|
+
# Merge score + raw data
|
|
42
|
+
printf '%s' "$SCORE" | jq \
|
|
43
|
+
--argjson perBot "$PER_BOT" \
|
|
44
|
+
--argjson independent "$INDEPENDENT" \
|
|
45
|
+
'. + {raw: {perBot: $perBot, independent: $independent}}'
|
|
package/skills/crawl-sim/scripts/check-llmstxt.sh
CHANGED
|
@@ -79,8 +79,12 @@ LLMS_FULL_HAS_TITLE=$HAS_TITLE
|
|
|
79
79
|
LLMS_FULL_HAS_DESC=$HAS_DESCRIPTION
|
|
80
80
|
LLMS_FULL_URLS=$URL_COUNT
|
|
81
81
|
|
|
82
|
+
TOP_EXISTS=false
|
|
83
|
+
[ "$LLMS_EXISTS" = "true" ] || [ "$LLMS_FULL_EXISTS" = "true" ] && TOP_EXISTS=true
|
|
84
|
+
|
|
82
85
|
jq -n \
|
|
83
86
|
--arg url "$URL" \
|
|
87
|
+
--argjson topExists "$TOP_EXISTS" \
|
|
84
88
|
--arg llmsUrl "${ORIGIN}/llms.txt" \
|
|
85
89
|
--arg llmsFullUrl "${ORIGIN}/llms-full.txt" \
|
|
86
90
|
--argjson llmsExists "$LLMS_EXISTS" \
|
|
@@ -96,6 +100,7 @@ jq -n \
|
|
|
96
100
|
--argjson llmsFullUrls "$LLMS_FULL_URLS" \
|
|
97
101
|
'{
|
|
98
102
|
url: $url,
|
|
103
|
+
exists: $topExists,
|
|
99
104
|
llmsTxt: {
|
|
100
105
|
url: $llmsUrl,
|
|
101
106
|
exists: $llmsExists,
|
|
package/skills/crawl-sim/scripts/check-sitemap.sh
CHANGED
|
@@ -25,6 +25,7 @@ CONTAINS_TARGET=false
|
|
|
25
25
|
HAS_LASTMOD=false
|
|
26
26
|
IS_INDEX=false
|
|
27
27
|
CHILD_SITEMAP_COUNT=0
|
|
28
|
+
SAMPLE_URLS="[]"
|
|
28
29
|
|
|
29
30
|
if [ "$HTTP_STATUS" = "200" ] && [ -s "$SITEMAP_FILE" ]; then
|
|
30
31
|
# Check if content looks like XML (not HTML fallback)
|
|
@@ -43,6 +44,12 @@ if [ "$HTTP_STATUS" = "200" ] && [ -s "$SITEMAP_FILE" ]; then
|
|
|
43
44
|
# Count <loc> tags (URLs, or child sitemaps in an index)
|
|
44
45
|
URL_COUNT=$(grep -oE '<loc>' "$SITEMAP_FILE" | wc -l | tr -d ' ')
|
|
45
46
|
|
|
47
|
+
# Extract first 10 <loc> URLs as sample
|
|
48
|
+
SAMPLE_URLS=$(grep -oE '<loc>[^<]+</loc>' "$SITEMAP_FILE" \
|
|
49
|
+
| sed -E 's/<\/?loc>//g' \
|
|
50
|
+
| head -10 \
|
|
51
|
+
| jq -R . | jq -s .)
|
|
52
|
+
|
|
46
53
|
# Check if target URL appears anywhere in the sitemap
|
|
47
54
|
# Match both with and without trailing slash
|
|
48
55
|
URL_NO_TRAILING=$(printf '%s' "$URL" | sed -E 's#/$##')
|
|
@@ -67,6 +74,7 @@ jq -n \
|
|
|
67
74
|
--argjson childSitemapCount "$CHILD_SITEMAP_COUNT" \
|
|
68
75
|
--argjson containsTarget "$CONTAINS_TARGET" \
|
|
69
76
|
--argjson hasLastmod "$HAS_LASTMOD" \
|
|
77
|
+
--argjson sampleUrls "$SAMPLE_URLS" \
|
|
70
78
|
'{
|
|
71
79
|
url: $url,
|
|
72
80
|
sitemapUrl: $sitemapUrl,
|
|
@@ -75,5 +83,6 @@ jq -n \
|
|
|
75
83
|
urlCount: $urlCount,
|
|
76
84
|
childSitemapCount: $childSitemapCount,
|
|
77
85
|
containsTarget: $containsTarget,
|
|
78
|
-
hasLastmod: $hasLastmod
|
|
86
|
+
hasLastmod: $hasLastmod,
|
|
87
|
+
sampleUrls: $sampleUrls
|
|
79
88
|
}'
|
|
package/skills/crawl-sim/scripts/compute-score.sh
CHANGED
|
@@ -312,12 +312,16 @@ for bot_id in $BOTS; do
|
|
|
312
312
|
continue
|
|
313
313
|
fi
|
|
314
314
|
|
|
315
|
-
|
|
316
|
-
TOTAL_TIME
|
|
317
|
-
|
|
318
|
-
|
|
315
|
+
# Batch-read fields from fetch file (1 jq call instead of 4)
|
|
316
|
+
read -r STATUS TOTAL_TIME SERVER_WORD_COUNT RENDERS_JS <<< \
|
|
317
|
+
"$(jq -r '[
|
|
318
|
+
(.status // 0),
|
|
319
|
+
(.timing.total // 0),
|
|
320
|
+
(.wordCount // 0),
|
|
321
|
+
(.bot.rendersJavaScript | if . == null then "unknown" else tostring end)
|
|
322
|
+
] | @tsv' "$FETCH" 2>/dev/null || echo "0 0 0 unknown")"
|
|
319
323
|
|
|
320
|
-
ROBOTS_ALLOWED=$(
|
|
324
|
+
ROBOTS_ALLOWED=$(jq -r '.allowed // false | tostring' "$ROBOTS" 2>/dev/null || echo "false")
|
|
321
325
|
|
|
322
326
|
EFFECTIVE_WORD_COUNT=$SERVER_WORD_COUNT
|
|
323
327
|
HYDRATION_PENALTY=0
|
|
@@ -341,10 +345,15 @@ for bot_id in $BOTS; do
|
|
|
341
345
|
|
|
342
346
|
# --- Category 1: Accessibility (0-100) ---
|
|
343
347
|
ACC=0
|
|
344
|
-
[ "$ROBOTS_ALLOWED"
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
+
if [ "$ROBOTS_ALLOWED" != "true" ]; then
|
|
349
|
+
# R4 critical-fail: robots blocking overrides accessibility to 0/F
|
|
350
|
+
ACC=0
|
|
351
|
+
else
|
|
352
|
+
ACC=$((ACC + 40))
|
|
353
|
+
[ "$STATUS" = "200" ] && ACC=$((ACC + 40))
|
|
354
|
+
TIME_SCORE=$(awk -v t="$TOTAL_TIME" 'BEGIN { if (t < 2) print 20; else if (t < 5) print 10; else print 0 }')
|
|
355
|
+
ACC=$((ACC + TIME_SCORE))
|
|
356
|
+
fi
|
|
348
357
|
|
|
349
358
|
# --- Category 2: Content Visibility (0-100) ---
|
|
350
359
|
CONTENT=0
|
|
@@ -353,18 +362,23 @@ for bot_id in $BOTS; do
|
|
|
353
362
|
elif [ "$EFFECTIVE_WORD_COUNT" -ge 50 ]; then CONTENT=$((CONTENT + 10))
|
|
354
363
|
fi
|
|
355
364
|
|
|
356
|
-
|
|
357
|
-
H2_COUNT
|
|
365
|
+
# Batch-read fields from meta + links (1 jq call instead of 4 + 1)
|
|
366
|
+
read -r H1_COUNT H2_COUNT IMG_TOTAL IMG_WITH_ALT <<< \
|
|
367
|
+
"$(jq -r '[
|
|
368
|
+
(.headings.h1.count // 0),
|
|
369
|
+
(.headings.h2.count // 0),
|
|
370
|
+
(.images.total // 0),
|
|
371
|
+
(.images.withAlt // 0)
|
|
372
|
+
] | @tsv' "$META" 2>/dev/null || echo "0 0 0 0")"
|
|
373
|
+
|
|
374
|
+
INTERNAL_LINKS=$(jq -r 'if (.internal | type) == "number" then .internal else .counts.internal // 0 end' "$LINKS" 2>/dev/null || echo "0")
|
|
375
|
+
|
|
358
376
|
[ "$H1_COUNT" -ge 1 ] && CONTENT=$((CONTENT + 20))
|
|
359
377
|
[ "$H2_COUNT" -ge 1 ] && CONTENT=$((CONTENT + 15))
|
|
360
378
|
|
|
361
|
-
INTERNAL_LINKS=$(jget_num "$LINKS" '.counts.internal')
|
|
362
379
|
if [ "$INTERNAL_LINKS" -ge 5 ]; then CONTENT=$((CONTENT + 20))
|
|
363
380
|
elif [ "$INTERNAL_LINKS" -ge 1 ]; then CONTENT=$((CONTENT + 10))
|
|
364
381
|
fi
|
|
365
|
-
|
|
366
|
-
IMG_TOTAL=$(jget_num "$META" '.images.total')
|
|
367
|
-
IMG_WITH_ALT=$(jget_num "$META" '.images.withAlt')
|
|
368
382
|
if [ "$IMG_TOTAL" -eq 0 ]; then
|
|
369
383
|
CONTENT=$((CONTENT + 15))
|
|
370
384
|
else
|
|
@@ -430,7 +444,7 @@ for bot_id in $BOTS; do
|
|
|
430
444
|
if ! list_contains "$field" $BLOCK_FIELDS; then
|
|
431
445
|
FIELD_VIOLATIONS_JSON=$(printf '%s' "$FIELD_VIOLATIONS_JSON" | jq \
|
|
432
446
|
--arg schema "$BLOCK_TYPE" --arg field "$field" \
|
|
433
|
-
'. + [{kind: "missing_required_field", schema: $schema, field: $field, impact: -5}]')
|
|
447
|
+
'. + [{kind: "missing_required_field", schema: $schema, field: $field, impact: -5, confidence: "high"}]')
|
|
434
448
|
FIELD_PENALTY=$((FIELD_PENALTY + 5))
|
|
435
449
|
fi
|
|
436
450
|
done
|
|
@@ -493,9 +507,9 @@ for bot_id in $BOTS; do
|
|
|
493
507
|
missing: ($missingList | to_arr),
|
|
494
508
|
extras: ($extrasList | to_arr),
|
|
495
509
|
violations: (
|
|
496
|
-
($forbiddenPresent | to_arr | map({kind: "forbidden_schema", schema: ., impact: -10}))
|
|
510
|
+
($forbiddenPresent | to_arr | map({kind: "forbidden_schema", schema: ., impact: -10, confidence: "high"}))
|
|
497
511
|
+ (if $validPenalty > 0
|
|
498
|
-
then [{kind: "invalid_jsonld", count: $invalidCount, impact: (0 - $validPenalty)}]
|
|
512
|
+
then [{kind: "invalid_jsonld", count: $invalidCount, impact: (0 - $validPenalty), confidence: "high"}]
|
|
499
513
|
else []
|
|
500
514
|
end)
|
|
501
515
|
+ $fieldViolations
|
|
@@ -507,20 +521,24 @@ for bot_id in $BOTS; do
|
|
|
507
521
|
|
|
508
522
|
# --- Category 4: Technical Signals (0-100) ---
|
|
509
523
|
TECHNICAL=0
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
+
# Batch-read meta fields for technical scoring (1 jq call instead of 5)
|
|
525
|
+
IFS=$'\t' read -r TITLE DESCRIPTION CANONICAL OG_TITLE OG_DESC <<< \
|
|
526
|
+
"$(jq -r '[
|
|
527
|
+
(.title // "" | gsub("\t"; " ")),
|
|
528
|
+
(.description // "" | gsub("\t"; " ")),
|
|
529
|
+
(.canonical // "" | gsub("\t"; " ")),
|
|
530
|
+
(.og.title // "" | gsub("\t"; " ")),
|
|
531
|
+
(.og.description // "" | gsub("\t"; " "))
|
|
532
|
+
] | @tsv' "$META" 2>/dev/null || printf '\t\t\t\t')"
|
|
533
|
+
|
|
534
|
+
[ -n "$TITLE" ] && TECHNICAL=$((TECHNICAL + 25))
|
|
535
|
+
[ -n "$DESCRIPTION" ] && TECHNICAL=$((TECHNICAL + 25))
|
|
536
|
+
[ -n "$CANONICAL" ] && TECHNICAL=$((TECHNICAL + 20))
|
|
537
|
+
[ -n "$OG_TITLE" ] && TECHNICAL=$((TECHNICAL + 8))
|
|
538
|
+
[ -n "$OG_DESC" ] && TECHNICAL=$((TECHNICAL + 7))
|
|
539
|
+
|
|
540
|
+
SITEMAP_EXISTS=$(jq -r '.exists // false | tostring' "$SITEMAP_FILE" 2>/dev/null || echo "false")
|
|
541
|
+
SITEMAP_CONTAINS=$(jq -r '.containsTarget // false | tostring' "$SITEMAP_FILE" 2>/dev/null || echo "false")
|
|
524
542
|
if [ "$SITEMAP_EXISTS" = "true" ] && [ "$SITEMAP_CONTAINS" = "true" ]; then
|
|
525
543
|
TECHNICAL=$((TECHNICAL + 15))
|
|
526
544
|
elif [ "$SITEMAP_EXISTS" = "true" ]; then
|
|
@@ -529,10 +547,14 @@ for bot_id in $BOTS; do
|
|
|
529
547
|
|
|
530
548
|
# --- Category 5: AI Readiness (0-100) ---
|
|
531
549
|
AI=0
|
|
532
|
-
|
|
533
|
-
LLMS_HAS_TITLE
|
|
534
|
-
|
|
535
|
-
|
|
550
|
+
# Batch-read llmstxt fields — use top-level exists (M1) which covers both variants
|
|
551
|
+
read -r LLMS_EXISTS LLMS_HAS_TITLE LLMS_HAS_DESC LLMS_URLS <<< \
|
|
552
|
+
"$(jq -r '[
|
|
553
|
+
(.exists // (.llmsTxt.exists or .llmsFullTxt.exists) | tostring),
|
|
554
|
+
((.llmsTxt.hasTitle // .llmsFullTxt.hasTitle // false) | tostring),
|
|
555
|
+
((.llmsTxt.hasDescription // .llmsFullTxt.hasDescription // false) | tostring),
|
|
556
|
+
((.llmsTxt.urlCount // 0) + (.llmsFullTxt.urlCount // 0))
|
|
557
|
+
] | @tsv' "$LLMSTXT_FILE" 2>/dev/null || echo "false false false 0")"
|
|
536
558
|
|
|
537
559
|
if [ "$LLMS_EXISTS" = "true" ]; then
|
|
538
560
|
AI=$((AI + 40))
|
|
@@ -541,7 +563,7 @@ for bot_id in $BOTS; do
|
|
|
541
563
|
[ "$LLMS_URLS" -ge 1 ] && AI=$((AI + 6))
|
|
542
564
|
fi
|
|
543
565
|
[ "$EFFECTIVE_WORD_COUNT" -ge 200 ] && AI=$((AI + 20))
|
|
544
|
-
if [ "$H1_COUNT" -ge 1 ] && [ -n "$DESCRIPTION" ]
|
|
566
|
+
if [ "$H1_COUNT" -ge 1 ] && [ -n "$DESCRIPTION" ]; then
|
|
545
567
|
AI=$((AI + 20))
|
|
546
568
|
fi
|
|
547
569
|
|
|
package/skills/crawl-sim/scripts/extract-links.sh
CHANGED
|
@@ -93,11 +93,9 @@ jq -n \
|
|
|
93
93
|
--argjson internalSample "$INTERNAL_SAMPLE" \
|
|
94
94
|
--argjson externalSample "$EXTERNAL_SAMPLE" \
|
|
95
95
|
'{
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
internal: $internalSample,
|
|
102
|
-
external: $externalSample
|
|
96
|
+
total: ($internalCount + $externalCount),
|
|
97
|
+
internal: $internalCount,
|
|
98
|
+
external: $externalCount,
|
|
99
|
+
internalUrls: $internalSample,
|
|
100
|
+
externalUrls: $externalSample
|
|
103
101
|
}'
|