jekyll-ai-visible-content 0.1.0 → 0.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -1
- data/CHANGELOG.md +66 -1
- data/Gemfile.lock +1 -1
- data/README.md +87 -7
- data/jekyll-ai-visible-content.gemspec +1 -1
- data/lib/jekyll-ai-visible-content.rb +4 -0
- data/lib/jekyll_ai_visible_content/configuration.rb +20 -2
- data/lib/jekyll_ai_visible_content/content_filter.rb +91 -0
- data/lib/jekyll_ai_visible_content/entity_classifier.rb +122 -0
- data/lib/jekyll_ai_visible_content/generators/ai_resource_generator.rb +232 -0
- data/lib/jekyll_ai_visible_content/generators/content_graph_generator.rb +68 -20
- data/lib/jekyll_ai_visible_content/generators/entity_map_generator.rb +2 -2
- data/lib/jekyll_ai_visible_content/generators/robots_txt_generator.rb +14 -0
- data/lib/jekyll_ai_visible_content/hooks/post_render_hook.rb +170 -9
- data/lib/jekyll_ai_visible_content/hooks/validate_hook.rb +71 -5
- data/lib/jekyll_ai_visible_content/tags/ai_resource_links_tag.rb +37 -0
- data/lib/jekyll_ai_visible_content/validators/entity_consistency_validator.rb +34 -6
- data/lib/jekyll_ai_visible_content/version.rb +1 -1
- metadata +7 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 70c2a75f924f17bb54ccd80796dc0da7f6ee26aeed7e7dc8555be88ad95c2497
|
|
4
|
+
data.tar.gz: c13846d123a0e9a81e3d4cea3bd31a5e81b9af24885e704cf5a8c586fd4220f9
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 388b85f05e6b0e3ac72f7c09f3e278c6505c0a640c6e307882c57e7de9716eb6671dd9534102f4a77aa32c41cc54a71d9578f52158ce5263a0af00f23a21a11a
|
|
7
|
+
data.tar.gz: fc73eef2bf48aa2867e6c146f56644f6962c104f079cbd4bf02c512f8ba047809823bcacab390b9f77b1e278ecad5b9a1aec56040c2bae21c5629be236991f10
|
data/.gitignore
CHANGED
data/CHANGELOG.md
CHANGED
|
@@ -1,6 +1,71 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
-
## 0.
|
|
3
|
+
## 0.4.6 (2026-04-07)
|
|
4
|
+
|
|
5
|
+
- Fix entity auto-linking to avoid nested `<a>` tags by skipping replacements inside existing anchor blocks
|
|
6
|
+
- Add integration regression coverage for homepage nested-anchor prevention
|
|
7
|
+
- Resolve remaining RuboCop style offense in `EntityClassifier`
|
|
8
|
+
|
|
9
|
+
## 0.4.5 (2026-04-07)
|
|
10
|
+
|
|
11
|
+
- Apply safe layout fix by moving `link[rel="ai:*"]` injection into `<head>` while keeping AI instruction block before `</body>`
|
|
12
|
+
- Avoid appending raw `<link>` elements at the end of `<body>` to prevent theme/script edge-case rendering issues
|
|
13
|
+
- Keep AI resource discovery behavior unchanged for JSON/YAML/Markdown links
|
|
14
|
+
|
|
15
|
+
## 0.4.4 (2026-04-07)
|
|
16
|
+
|
|
17
|
+
- Refine AI page markdown output to exclude full Jekyll front matter and keep only AI-relevant intro metadata
|
|
18
|
+
- Build structured AI-readable markdown preface from `title`, `subtitle`, and `description`
|
|
19
|
+
- Keep body content markdown while stripping Liquid/Jekyll template directives for cleaner LLM ingestion
|
|
20
|
+
|
|
21
|
+
## 0.4.3 (2026-04-07)
|
|
22
|
+
|
|
23
|
+
- Serve `/ai/page/*.md` as raw markdown output (not HTML-rendered) by generating text-backed pages with `.md` permalinks
|
|
24
|
+
- Strip Liquid/Jekyll service tags (`{% ... %}`, `{{ ... }}`, comment blocks) from AI markdown content for cleaner machine-readable text
|
|
25
|
+
- Read markdown content from source files to avoid leaking internal Jekyll runtime objects into AI resources
|
|
26
|
+
|
|
27
|
+
## 0.4.2 (2026-04-07)
|
|
28
|
+
|
|
29
|
+
- Ensure AI link and instruction injection also works reliably on home and about pages via URL normalization/fallback lookup
|
|
30
|
+
- Generate page-level markdown resources under `/ai/page/<slug>.md` using real source front matter/content instead of entity summary markdown
|
|
31
|
+
- Improve markdown resource slug normalization and add coverage for home-page injection and page-markdown outputs
|
|
32
|
+
|
|
33
|
+
## 0.4.1 (2026-04-07)
|
|
34
|
+
|
|
35
|
+
- Add fallback entity classification for general articles (derive stable topic slug from page URL/title when explicit entities are absent)
|
|
36
|
+
- Ensure AI resources are generated for ordinary posts/pages so AI link injection can still occur
|
|
37
|
+
- Add regression test coverage for general-article fallback classification
|
|
38
|
+
|
|
39
|
+
## 0.4.0 (2026-04-07)
|
|
40
|
+
|
|
41
|
+
- Add automatic AI resource generation per content page with deterministic `/ai/<type>/<slug>.{json,yml,md}` outputs
|
|
42
|
+
- Add content-aware entity classification heuristics (person/entity/topic) from front matter and page content
|
|
43
|
+
- Inject `<link rel="ai:*">` tags and AI parsing instruction block before `</body>` in rendered HTML
|
|
44
|
+
- Add `{% ai_resource_links %}` Liquid fallback for manual layout integration
|
|
45
|
+
- Exclude generated `/ai/` resources from content filtering/orphan detection
|
|
46
|
+
- Add unit and integration coverage for AI resource generation and HTML injection flow
|
|
47
|
+
|
|
48
|
+
## 0.3.0 (2026-04-07)
|
|
49
|
+
|
|
50
|
+
- Fix false positives for orphan-page detection by analyzing rendered HTML instead of raw source content
|
|
51
|
+
- Build inbound-link graph from final `<a href>` values produced by Liquid/layout rendering
|
|
52
|
+
- Add canonical URL normalization for orphan analysis:
|
|
53
|
+
- strip query strings and hash fragments
|
|
54
|
+
- normalize `index.html` to directory URLs
|
|
55
|
+
- normalize trailing slashes for non-file paths
|
|
56
|
+
- resolve absolute internal URLs and handle `baseurl`
|
|
57
|
+
- Add regression tests for Liquid-generated links and URL normalization in content graph
|
|
58
|
+
|
|
59
|
+
## 0.2.0 (2026-04-07)
|
|
60
|
+
|
|
61
|
+
- Add shared content filtering module to reduce validator noise on assets/generated pages
|
|
62
|
+
- Improve entity consistency checks with `entity.author_aliases` and `_data/authors.yml` resolution
|
|
63
|
+
- Add robots.txt conflict detection to skip generation when a site already provides `robots.txt`
|
|
64
|
+
- Add grouped validation output with counts and configurable examples (`validation.max_examples`)
|
|
65
|
+
- Add new validation config defaults: `content_only`, `exclude_paths`, `verbose`, `max_examples`
|
|
66
|
+
- Filter entity-map mention scanning to authored content pages only
|
|
67
|
+
|
|
68
|
+
## 0.1.0
|
|
4
69
|
|
|
5
70
|
- Initial release
|
|
6
71
|
- JSON-LD generation: Person, BlogPosting, WebSite, BreadcrumbList, FAQPage, HowTo
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
|
@@ -90,6 +90,8 @@ ai_visible_content:
|
|
|
90
90
|
same_as: # Links to authoritative profiles
|
|
91
91
|
- https://linkedin.com/in/handle
|
|
92
92
|
- https://github.com/handle
|
|
93
|
+
author_aliases: # Slugs that map to the canonical name
|
|
94
|
+
- your-slug # e.g., from _data/authors.yml keys
|
|
93
95
|
works_for:
|
|
94
96
|
type: Organization
|
|
95
97
|
name: "Company Name"
|
|
@@ -130,6 +132,7 @@ ai_visible_content:
|
|
|
130
132
|
# --- Internal Linking ---
|
|
131
133
|
linking:
|
|
132
134
|
enable_entity_links: true # Auto-link known entities in post body
|
|
135
|
+
apply_to_metadata: false # Safe default: never inject <a> into head/SEO/JSON-LD/feed fields
|
|
133
136
|
entity_definitions: {} # Custom: slug -> {name, url, description}
|
|
134
137
|
max_links_per_entity_per_post: 1
|
|
135
138
|
enable_related_posts: true
|
|
@@ -143,8 +146,18 @@ ai_visible_content:
|
|
|
143
146
|
warn_orphan_pages: true
|
|
144
147
|
warn_missing_descriptions: true
|
|
145
148
|
fail_build_on_error: false # true = exit 1 on validation failure
|
|
149
|
+
content_only: true # Only validate authored HTML content pages
|
|
150
|
+
exclude_paths: [] # Glob patterns to skip: ["/custom/*", "/drafts/*"]
|
|
151
|
+
verbose: false # true = show every warning; false = grouped summary
|
|
152
|
+
max_examples: 3 # Max examples per warning group in summary mode
|
|
146
153
|
```
|
|
147
154
|
|
|
155
|
+
### Entity Linking Safety
|
|
156
|
+
|
|
157
|
+
`linking.apply_to_metadata` defaults to `false` to keep metadata as plain text. With this default, entity auto-linking is applied to article body content only and is not applied to `<head>` meta tags, JSON-LD descriptions, or feed summaries.
|
|
158
|
+
|
|
159
|
+
Set `linking.apply_to_metadata: true` only if you explicitly want legacy full-document linking behavior.
|
|
160
|
+
|
|
148
161
|
## Layout Integration
|
|
149
162
|
|
|
150
163
|
### Automatic Mode (Recommended)
|
|
@@ -320,14 +333,81 @@ Normalized, lowercase, hyphenated. Each tag can serve as a topic hub page.
|
|
|
320
333
|
|
|
321
334
|
## Build Validation
|
|
322
335
|
|
|
323
|
-
During `jekyll build`, the plugin
|
|
336
|
+
During `jekyll build`, the plugin validates your site and prints a grouped summary:
|
|
337
|
+
|
|
338
|
+
```
|
|
339
|
+
AI Visible Content: === Validation Report ===
|
|
340
|
+
AI Visible Content: 1 posts missing last_modified_at (freshness scoring disabled)
|
|
341
|
+
AI Visible Content: Missing last_modified_at in _posts/2026-03-04-redis.md
|
|
342
|
+
AI Visible Content: 1 content pages missing description
|
|
343
|
+
AI Visible Content: Missing description in _posts/2018-01-18-first-post.md
|
|
344
|
+
```
|
|
345
|
+
|
|
346
|
+
### What Gets Checked
|
|
347
|
+
|
|
348
|
+
| Check | Description | Config key |
|
|
349
|
+
|-------|-------------|------------|
|
|
350
|
+
| Name inconsistency | `site.author` or post `author:` differs from `entity.name` | `warn_name_inconsistency` |
|
|
351
|
+
| Missing sameAs | No links to LinkedIn, GitHub, etc. | `warn_missing_same_as` |
|
|
352
|
+
| Missing dateModified | Posts without `last_modified_at` | `warn_missing_dates` |
|
|
353
|
+
| Missing description | Content pages without `description` in front matter | `warn_missing_descriptions` |
|
|
354
|
+
| Orphan pages | Content pages with zero inbound internal links | `warn_orphan_pages` |
|
|
355
|
+
| Generic titles | Titles like "About" without entity name | always on |
|
|
356
|
+
|
|
357
|
+
### Content-Only Filtering
|
|
358
|
+
|
|
359
|
+
By default (`content_only: true`), validation only checks authored HTML content pages. It automatically skips:
|
|
360
|
+
|
|
361
|
+
- **Generated files**: `robots.txt`, `llms.txt`, `entity-map.json`, `sitemap.xml`, `feed.xml`
|
|
362
|
+
- **Asset files**: `.js`, `.css`, `.json`, `.xml`, `.map`, `.webmanifest`
|
|
363
|
+
- **Tag/category pages**: `/tags/*`, `/categories/*`
|
|
364
|
+
- **Utility pages**: `404.html`, `redirect.html`, pagination pages (`/page2/`, etc.)
|
|
365
|
+
- **Assets directory**: anything under `/assets/`
|
|
366
|
+
|
|
367
|
+
Set `content_only: false` to validate all pages (not recommended for most sites).
|
|
368
|
+
|
|
369
|
+
### Author Alias Resolution
|
|
370
|
+
|
|
371
|
+
Jekyll themes like Chirpy use `_data/authors.yml` to map author slugs to names. The plugin resolves author names through two mechanisms:
|
|
372
|
+
|
|
373
|
+
1. **Explicit aliases** via `entity.author_aliases`:
|
|
374
|
+
|
|
375
|
+
```yaml
|
|
376
|
+
ai_visible_content:
|
|
377
|
+
entity:
|
|
378
|
+
name: "Eugene Leontev"
|
|
379
|
+
author_aliases:
|
|
380
|
+
- eugene
|
|
381
|
+
- nasuta
|
|
382
|
+
```
|
|
383
|
+
|
|
384
|
+
2. **Automatic resolution** via `_data/authors.yml`: if a post's `author:` value is a key in `_data/authors.yml` whose `name` matches `entity.name`, no warning is emitted.
|
|
385
|
+
|
|
386
|
+
### robots.txt Conflict Detection
|
|
387
|
+
|
|
388
|
+
If your site already has a `robots.txt` (as a source file or static file), the plugin skips generation and logs a warning. Either:
|
|
389
|
+
- Set `crawlers.generate_robots_txt: false` to silence the warning
|
|
390
|
+
- Remove your existing `robots.txt` to use the generated one with AI crawler rules
|
|
391
|
+
|
|
392
|
+
### Excluding Paths from Validation
|
|
393
|
+
|
|
394
|
+
Use `validation.exclude_paths` to skip specific paths:
|
|
395
|
+
|
|
396
|
+
```yaml
|
|
397
|
+
ai_visible_content:
|
|
398
|
+
validation:
|
|
399
|
+
exclude_paths:
|
|
400
|
+
- "/drafts/*"
|
|
401
|
+
- "/archive/*"
|
|
402
|
+
```
|
|
403
|
+
|
|
404
|
+
### Verbose Mode
|
|
405
|
+
|
|
406
|
+
Set `validation.verbose: true` to see every individual warning instead of grouped summaries. Useful for debugging but noisy on large sites.
|
|
407
|
+
|
|
408
|
+
### Orphan Detection Limitation
|
|
324
409
|
|
|
325
|
-
|
|
326
|
-
- **Missing sameAs**: No links to LinkedIn, GitHub, etc.
|
|
327
|
-
- **Missing dateModified**: Posts without `last_modified_at` (hurts freshness scoring)
|
|
328
|
-
- **Missing description**: Pages without `description` in front matter
|
|
329
|
-
- **Orphan pages**: Pages with zero inbound internal links
|
|
330
|
-
- **Generic titles**: Titles like "About" without entity name
|
|
410
|
+
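Since 0.3.0, orphan detection builds its inbound-link graph from the `<a href>` values in the final rendered HTML, so links generated by Liquid templates and layouts (e.g., `{{ post.url }}` in `{% for post in site.posts %}`) are counted. Versions before 0.3.0 scanned raw Markdown/HTML content for `href=` links and could not detect Liquid-generated links, which meant posts linked only through theme-generated navigation could appear as orphans. If you still see such false positives, check that you are on 0.3.0 or later.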
|
|
331
411
|
|
|
332
412
|
Set `validation.fail_build_on_error: true` to make errors break the build in CI.
|
|
333
413
|
|
|
@@ -16,7 +16,7 @@ Gem::Specification.new do |spec|
|
|
|
16
16
|
|
|
17
17
|
spec.required_ruby_version = '>= 3.2'
|
|
18
18
|
spec.metadata['homepage_uri'] = spec.homepage
|
|
19
|
-
spec.metadata['source_code_uri'] = spec.homepage
|
|
19
|
+
spec.metadata['source_code_uri'] = "#{spec.homepage}.git"
|
|
20
20
|
spec.metadata['changelog_uri'] = "#{spec.homepage}/blob/master/CHANGELOG.md"
|
|
21
21
|
spec.metadata['rubygems_mfa_required'] = 'true'
|
|
22
22
|
|
|
@@ -5,6 +5,7 @@ require 'json'
|
|
|
5
5
|
|
|
6
6
|
require_relative 'jekyll_ai_visible_content/version'
|
|
7
7
|
require_relative 'jekyll_ai_visible_content/configuration'
|
|
8
|
+
require_relative 'jekyll_ai_visible_content/content_filter'
|
|
8
9
|
require_relative 'jekyll_ai_visible_content/entity/person'
|
|
9
10
|
require_relative 'jekyll_ai_visible_content/entity/organization'
|
|
10
11
|
require_relative 'jekyll_ai_visible_content/entity/registry'
|
|
@@ -16,15 +17,18 @@ require_relative 'jekyll_ai_visible_content/json_ld/breadcrumb_schema'
|
|
|
16
17
|
require_relative 'jekyll_ai_visible_content/json_ld/faq_schema'
|
|
17
18
|
require_relative 'jekyll_ai_visible_content/json_ld/how_to_schema'
|
|
18
19
|
require_relative 'jekyll_ai_visible_content/json_ld/collection_schema'
|
|
20
|
+
require_relative 'jekyll_ai_visible_content/entity_classifier'
|
|
19
21
|
require_relative 'jekyll_ai_visible_content/generators/llms_txt_generator'
|
|
20
22
|
require_relative 'jekyll_ai_visible_content/generators/robots_txt_generator'
|
|
21
23
|
require_relative 'jekyll_ai_visible_content/generators/entity_map_generator'
|
|
22
24
|
require_relative 'jekyll_ai_visible_content/generators/content_graph_generator'
|
|
25
|
+
require_relative 'jekyll_ai_visible_content/generators/ai_resource_generator'
|
|
23
26
|
require_relative 'jekyll_ai_visible_content/tags/ai_json_ld_tag'
|
|
24
27
|
require_relative 'jekyll_ai_visible_content/tags/ai_author_tag'
|
|
25
28
|
require_relative 'jekyll_ai_visible_content/tags/ai_entity_link_tag'
|
|
26
29
|
require_relative 'jekyll_ai_visible_content/tags/ai_related_posts_tag'
|
|
27
30
|
require_relative 'jekyll_ai_visible_content/tags/ai_breadcrumb_tag'
|
|
31
|
+
require_relative 'jekyll_ai_visible_content/tags/ai_resource_links_tag'
|
|
28
32
|
require_relative 'jekyll_ai_visible_content/filters/naming_filter'
|
|
29
33
|
require_relative 'jekyll_ai_visible_content/filters/entity_filter'
|
|
30
34
|
require_relative 'jekyll_ai_visible_content/hooks/post_render_hook'
|
|
@@ -19,7 +19,8 @@ module JekyllAiVisibleContent
|
|
|
19
19
|
'knows_about' => [],
|
|
20
20
|
'same_as' => [],
|
|
21
21
|
'works_for' => nil,
|
|
22
|
-
'occupation' => nil
|
|
22
|
+
'occupation' => nil,
|
|
23
|
+
'author_aliases' => []
|
|
23
24
|
},
|
|
24
25
|
'json_ld' => {
|
|
25
26
|
'auto_inject' => true,
|
|
@@ -49,6 +50,7 @@ module JekyllAiVisibleContent
|
|
|
49
50
|
},
|
|
50
51
|
'linking' => {
|
|
51
52
|
'enable_entity_links' => true,
|
|
53
|
+
'apply_to_metadata' => false,
|
|
52
54
|
'entity_definitions' => {},
|
|
53
55
|
'max_links_per_entity_per_post' => 1,
|
|
54
56
|
'enable_related_posts' => true,
|
|
@@ -60,7 +62,19 @@ module JekyllAiVisibleContent
|
|
|
60
62
|
'warn_missing_dates' => true,
|
|
61
63
|
'warn_orphan_pages' => true,
|
|
62
64
|
'warn_missing_descriptions' => true,
|
|
63
|
-
'fail_build_on_error' => false
|
|
65
|
+
'fail_build_on_error' => false,
|
|
66
|
+
'exclude_paths' => [],
|
|
67
|
+
'content_only' => true,
|
|
68
|
+
'verbose' => false,
|
|
69
|
+
'max_examples' => 3
|
|
70
|
+
},
|
|
71
|
+
'ai_resources' => {
|
|
72
|
+
'enabled' => true,
|
|
73
|
+
'formats' => %w[json yaml markdown],
|
|
74
|
+
'max_links_per_page' => 5,
|
|
75
|
+
'auto_inject' => true,
|
|
76
|
+
'inject_instruction_block' => true,
|
|
77
|
+
'base_path' => '/ai'
|
|
64
78
|
}
|
|
65
79
|
}.freeze
|
|
66
80
|
|
|
@@ -108,6 +122,10 @@ module JekyllAiVisibleContent
|
|
|
108
122
|
@raw['validation']
|
|
109
123
|
end
|
|
110
124
|
|
|
125
|
+
def ai_resources
|
|
126
|
+
@raw['ai_resources']
|
|
127
|
+
end
|
|
128
|
+
|
|
111
129
|
def site_url
|
|
112
130
|
@site.config['url'] || ''
|
|
113
131
|
end
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module JekyllAiVisibleContent
  # Shared predicate for deciding whether a Jekyll page/document is an
  # authored HTML content page, as opposed to a generated file, an asset,
  # a redirect stub, a utility page, or a user-excluded path.
  #
  # Every check is duck-typed: a +doc+ only needs to respond to the
  # accessors a given check reads (+url+, +name+, +output_ext+, +data+,
  # +relative_path+); missing accessors fall back to a neutral value.
  module ContentFilter
    # File names that are never treated as content pages.
    GENERATED_NAMES = %w[
      robots.txt llms.txt llms-full.txt entity-map.json
      sitemap.xml feed.xml atom.xml redirects.json
    ].freeze

    # URL extensions that mark a page as an asset rather than content.
    ASSET_EXTENSIONS = %w[
      .js .css .scss .map .json .xml .txt .webmanifest .ico .svg .png .jpg .jpeg .gif .woff .woff2 .ttf .eot
    ].freeze

    # Output/source extensions that count as HTML content.
    HTML_EXTENSIONS = %w[.html .htm .md .markdown].freeze

    # URL patterns of non-content pages. Anchored with \A/\z so a match
    # always refers to the whole URL string, never to a single line of it.
    UTILITY_PATH_PATTERNS = [
      %r{\A/404\.html\z},
      %r{\A/tags/},
      %r{\A/categories/},
      %r{\A/assets/},
      %r{\A/page\d+/},
      %r{\A/norobots/},
      %r{\A/ai/}
    ].freeze

    class << self
      # Tells whether +doc+ is an authored HTML content page.
      #
      # @param doc [#url] page or document to test
      # @param config [#validation, nil] plugin configuration; when nil the
      #   user-defined +exclude_paths+ check is skipped
      # @return [Boolean]
      def content_page?(doc, config = nil)
        return false unless html_output?(doc)
        return false if generated_file?(doc)
        return false if asset_path?(doc)
        return false if redirect_page?(doc)
        return false if utility_page?(doc)
        return false if excluded_path?(doc, config)

        true
      end

      # Returns every post plus the pages that pass {content_page?}.
      # Posts are included as-is; only +site.pages+ is filtered.
      #
      # @param site [#posts, #pages]
      # @param config [#validation, nil]
      # @return [Array]
      def content_pages(site, config = nil)
        site.posts.docs + site.pages.select { |page| content_page?(page, config) }
      end

      private

      # URL of +doc+ as a String, or '' when the object has no +url+.
      def url_of(doc)
        doc.respond_to?(:url) ? doc.url.to_s : ''
      end

      # File name of +doc+; derived from the URL when there is no +name+.
      def name_of(doc)
        doc.respond_to?(:name) ? doc.name : File.basename(doc.url.to_s)
      end

      # True when the output extension (or, failing that, the file name's
      # extension) is an HTML/Markdown one, or the URL is a directory URL.
      def html_output?(doc)
        ext = doc.output_ext if doc.respond_to?(:output_ext)
        ext ||= File.extname(doc.respond_to?(:name) ? doc.name.to_s : doc.url.to_s)
        return true if HTML_EXTENSIONS.include?(ext)

        # A URL that ends in "/" cannot also end in ".ext", so the trailing
        # slash alone identifies a directory-style URL.
        url_of(doc).end_with?('/')
      end

      def generated_file?(doc)
        GENERATED_NAMES.include?(name_of(doc))
      end

      def asset_path?(doc)
        url = url_of(doc)
        return true if url.start_with?('/assets/')

        ASSET_EXTENSIONS.include?(File.extname(url))
      end

      def redirect_page?(doc)
        return true if doc.respond_to?(:data) && doc.data['redirect_to']

        name_of(doc) == 'redirect.html'
      end

      def utility_page?(doc)
        url = url_of(doc)
        UTILITY_PATH_PATTERNS.any? { |pattern| url.match?(pattern) }
      end

      # True when the URL or relative path of +doc+ matches one of the glob
      # patterns in +validation.exclude_paths+.
      #
      # The setting is coerced with Array() so a single scalar pattern
      # (exclude_paths: "/drafts/*") works like a one-element list, and nil
      # entries are dropped instead of raising inside File.fnmatch?.
      def excluded_path?(doc, config)
        return false unless config

        validation = config.validation || {}
        patterns = Array(validation['exclude_paths']).compact.map(&:to_s)
        return false if patterns.empty?

        url = url_of(doc)
        path = doc.respond_to?(:relative_path) ? doc.relative_path.to_s : url

        patterns.any? do |pattern|
          File.fnmatch?(pattern, url) || File.fnmatch?(pattern, path)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module JekyllAiVisibleContent
  # Heuristically derives the entities (person, organization, topics) a page
  # relates to from its front matter, URL and body text.
  #
  # Each entity is a Hash of the form
  #   { type: 'person' | 'entity' | 'topic', slug: String, name: String, relevance: Integer }
  module EntityClassifier
    RELEVANCE_FRONT_MATTER = 3
    RELEVANCE_TITLE = 2
    RELEVANCE_BODY = 1

    class << self
      # Collects the entities for +doc+, ranked by descending relevance.
      #
      # Ranking is stable: entities with equal relevance keep the order in
      # which they were discovered, so the result is the same on every
      # build. When several candidates share a slug, the highest-ranked one
      # is kept.
      #
      # @param doc [#data, #url, #content] page or document to classify
      # @param config [#ai_resources, #entity, #entity_type] plugin configuration
      # @return [Array<Hash>] at most +ai_resources.max_links_per_page+
      #   entities (5 when the setting is absent)
      def classify_page(doc, config)
        max = config.ai_resources['max_links_per_page'] || 5
        entities = []

        add_primary_entity(entities, doc, config)
        add_front_matter_topics(entities, doc, config)
        add_detected_topics(entities, doc, config)
        add_organization(entities, doc, config)
        add_general_topic_fallback(entities, doc)

        entities
          .each_with_index
          .sort_by { |entity, index| [-entity[:relevance], index] } # index breaks ties => stable order
          .map(&:first)
          .uniq { |entity| entity[:slug] }
          .first(max)
      end

      # Lowercases +name+ and collapses every run of characters outside
      # a-z / 0-9 into a single hyphen, trimming hyphens at both ends.
      #
      # @param name [#to_s]
      # @return [String] possibly empty
      def slugify(name)
        name.to_s.downcase.gsub(/[^a-z0-9]+/, '-').gsub(/(^-|-$)/, '')
      end

      private

      # Adds the configured site entity when the page declares
      # +entity_type: person+ or its URL ends in /about or /about/.
      def add_primary_entity(entities, doc, config)
        return unless config.entity['name']

        person_page = doc.data['entity_type'].to_s.downcase == 'person' ||
                      doc.url.to_s.match?(%r{/about/?$})
        return unless person_page

        entities << {
          type: config.entity_type.to_s.downcase == 'organization' ? 'entity' : 'person',
          slug: slugify(config.entity['id_slug'] || config.entity['name']),
          name: config.entity['name'],
          relevance: RELEVANCE_FRONT_MATTER + 1
        }
      end

      # Adds one topic per entry of the page's +topics+ front matter array.
      # Entries whose slug would be empty (nil, blank, punctuation only) are
      # skipped because they cannot be addressed by slug.
      def add_front_matter_topics(entities, doc, _config)
        topics = doc.data['topics']
        return unless topics.is_a?(Array)

        topics.each do |topic|
          slug = slugify(topic)
          next if slug.empty?

          entities << { type: 'topic', slug: slug, name: topic.to_s, relevance: RELEVANCE_FRONT_MATTER }
        end
      end

      # Adds each +entity.knows_about+ topic that occurs (case-insensitive
      # substring match) in the title/description or in the body.
      def add_detected_topics(entities, doc, config)
        known_topics = Array(config.entity['knows_about'])
        return if known_topics.empty?

        # to_s guards against non-String front matter such as `title: 2024`.
        title = doc.data['title'].to_s.downcase
        description = doc.data['description'].to_s.downcase
        body = doc.content.to_s.downcase

        known_topics.each do |topic|
          needle = topic.to_s.downcase
          slug = slugify(needle)
          # A blank needle is a substring of every text and would tag every page.
          next if needle.strip.empty? || slug.empty?

          relevance = if title.include?(needle) || description.include?(needle)
                        RELEVANCE_TITLE
                      elsif body.include?(needle)
                        RELEVANCE_BODY
                      end
          next unless relevance

          entities << { type: 'topic', slug: slug, name: topic.to_s, relevance: relevance }
        end
      end

      # Adds the +entity.works_for+ organization when its name is mentioned
      # in the page's title, description or body.
      def add_organization(entities, doc, config)
        works_for = config.entity['works_for']
        return unless works_for.is_a?(Hash) && works_for['name']

        org_name = works_for['name'].to_s
        slug = slugify(org_name)
        return if slug.empty?

        text = "#{doc.data['title']} #{doc.data['description']} #{doc.content}".downcase
        return unless text.include?(org_name.downcase)

        entities << { type: 'entity', slug: slug, name: org_name, relevance: RELEVANCE_BODY }
      end

      # When nothing else matched, derives a single topic from the last URL
      # segment, falling back to the title when the URL yields no slug.
      def add_general_topic_fallback(entities, doc)
        return unless entities.empty?

        slug = slugify(doc.url.to_s.split('/').reject(&:empty?).last)
        title = doc.data['title'].to_s.strip

        slug = slugify(title) if slug.empty?
        return if slug.empty?

        name = if title.empty?
                 slug.tr('-', ' ').split.map(&:capitalize).join(' ')
               else
                 title
               end

        entities << { type: 'topic', slug: slug, name: name, relevance: RELEVANCE_BODY }
      end
    end
  end
end
|