jekyll-ai-related 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/jekyll/embeddings-generator/version"
4
+
5
# Gem specification for jekyll-ai-related.
#
# Packages every git-tracked file, exposes anything under bin/ as an
# executable, and declares the runtime and development dependencies.
Gem::Specification.new do |gem|
  gem.name    = "jekyll-ai-related"
  gem.version = Jekyll::EmbeddingsGenerator::VERSION
  gem.authors = ["Francesco Pira"]
  gem.email   = ["dev@fpira.com"]

  gem.summary     = "Jekyll plugin to generate embeddings for posts and find related content"
  gem.description = "A Jekyll plugin that uses OpenAI embeddings to analyze posts and find related content"
  gem.homepage    = "https://github.com/pirafrank/jekyll-ai-related"

  gem.license               = "MIT"
  gem.required_ruby_version = ">= 3.2.0"

  # Ship exactly what git tracks; NUL-separated output keeps odd file names intact.
  tracked_files = `git ls-files -z`.split("\x0")
  gem.files         = tracked_files
  gem.executables   = gem.files.grep(%r!^bin/!).map { |path| File.basename(path) }
  gem.require_paths = ["lib"]

  gem.metadata = {
    "allowed_push_host" => "https://rubygems.org",
    "homepage_uri"      => gem.homepage,
    "changelog_uri"     => "https://github.com/pirafrank/jekyll-ai-related/blob/main/CHANGELOG.md",
    "bug_tracker_uri"   => "https://github.com/pirafrank/jekyll-ai-related/issues",
  }

  # Runtime dependencies
  gem.add_dependency "httparty", "~> 0.22.0"
  gem.add_dependency "jekyll", ">= 3.7", "< 5.0"
  gem.add_dependency "json", "~> 2.7"

  # Development-only dependencies
  gem.add_development_dependency "bundler", "~> 2.6"
  gem.add_development_dependency "rake", "~> 13.0"
  gem.add_development_dependency "rubocop-jekyll", "~> 0.14"
end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
module Jekyll
  module Commands
    # Adds the `related` subcommand to the jekyll executable. The subcommand
    # hands control to Jekyll::EmbeddingsGenerator.run.
    class EmbeddingsGenerator < Command
      class << self
        # Registers the `related` subcommand and its flags on the program.
        #
        # prog - the command-line program object that subcommands attach to.
        def init_with_program(prog)
          prog.command(:related) do |c|
            c.description "Generate embeddings for each post and find related posts."
            # Must name the command registered above so the usage line is correct.
            c.syntax "related [options]"

            c.option "debug",
                     "--debug",
                     "Most verbose. Set log level to Debug."
            c.option "quiet",
                     "--quiet",
                     "Do not print Info logs. Set log level to Error."
            c.option "future",
                     "--future",
                     "Get embeds and find related posts also for those with a future date."
            c.option "drafts",
                     "--drafts",
                     "Get embeds and find related posts also for drafts."

            c.action do |_, opts|
              # Nothing else in this plugin reads the "debug" flag, so apply it
              # here; otherwise --debug has no effect on the log level.
              Jekyll.logger.log_level = :debug if opts["debug"]
              Jekyll.logger.info "AI Related plugin starting..."
              options = configuration_from_options(opts)
              Jekyll::EmbeddingsGenerator.run(options)
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "httparty"
4
+ require "json"
5
+
6
module Jekyll
  module EmbeddingsGenerator
    # Client for the OpenAI embeddings endpoint.
    module Embeddings
      class << self
        include Jekyll::EmbeddingsGenerator

        # Requests an embedding vector for +text+ from the OpenAI API using the
        # "text-embedding-3-small" model.
        #
        # text - the content to embed.
        #
        # Returns the "embedding" value of the first entry in the response.
        # Raises RuntimeError when the API answers with a non-success status or
        # when the response carries no embedding.
        def generate_embeddings(text)
          config = Jekyll::EmbeddingsGenerator.config
          api_key = config["openai_api_key"]
          response = HTTParty.post(
            "https://api.openai.com/v1/embeddings",
            :headers => {
              "Authorization" => "Bearer #{api_key}",
              "Content-Type"  => "application/json",
            },
            :body    => {
              :model => "text-embedding-3-small",
              :input => text,
            }.to_json
          )

          raise "OpenAI API error: #{error_detail(response)}" unless response.success?

          embedding = extract_embedding(response.parsed_response)
          raise "OpenAI API error: response did not include an embedding" if embedding.nil?

          embedding
        end

        private

        # Best-effort description of a failed response. Error bodies are not
        # guaranteed to be JSON shaped like {"error" => {"message" => ...}}
        # (e.g. an HTML page from a proxy), so fall back to the HTTP status.
        def error_detail(response)
          payload = response.parsed_response
          error = payload["error"] if payload.is_a?(Hash)
          message = error.is_a?(Hash) ? error["message"] : error
          message || "HTTP #{response.code}"
        end

        # Pulls data[0].embedding out of a parsed response, returning nil when
        # any level is missing or has an unexpected type.
        def extract_embedding(payload)
          return nil unless payload.is_a?(Hash)

          entries = payload["data"]
          first_entry = entries.first if entries.is_a?(Array)
          first_entry["embedding"] if first_entry.is_a?(Hash)
        end
      end
    end
  end
end
@@ -0,0 +1,151 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "httparty"
4
+ require "json"
5
+
6
module Jekyll
  module EmbeddingsGenerator
    # Reads and writes post embeddings in the Supabase `page_embeddings` table
    # through its REST interface, and calls the `related_posts` RPC to find
    # the posts most similar to a given one.
    module Store
      class << self
        include Jekyll::EmbeddingsGenerator

        # Upserts +data+ unless the stored row is already at least as recent.
        #
        # data - object responding to uid, most_recent_edit, embedding,
        #        metadata and content.
        #
        # Returns false when nothing was written because the stored row is up
        # to date, nil after a successful upsert.
        # Raises RuntimeError when Supabase answers with a non-success status.
        def store_embedding(data) # rubocop:disable Metrics/AbcSize
          config = Jekyll::EmbeddingsGenerator.config

          # First check if record exists and its edit date
          existing = HTTParty.get(
            "#{config["supabase_url"]}/rest/v1/page_embeddings",
            # identity encoding: avoid supabase returning gzipped content
            :headers => supabase_headers({ "Accept-Encoding" => "identity" }),
            :query   => {
              "uid"    => "eq.#{data.uid}",
              "select" => "uid, most_recent_edit",
            }
          )

          Jekyll.logger.debug "response headers: #{existing.headers}"
          Jekyll.logger.debug "response body: #{existing.body}"

          raise "Supabase API error: #{existing.code} - #{existing.body}" unless existing.success?

          existing_record = existing.parsed_response&.first
          mre = data.most_recent_edit
          should_update = existing_record.nil? || Time.parse(existing_record["most_recent_edit"]) < mre

          # An explicit return is required: a bare `false unless ...` is a
          # no-op and lets every post fall through to the upsert.
          return false unless should_update

          update_embedding(data)
        end

        # Looks up the stored embedding of +post+ and returns the rows the
        # `related_posts` RPC reports as similar to it.
        #
        # post - object whose data hash holds the configured unique field.
        #
        # Returns the parsed RPC response, or an empty Array when the post has
        # no stored embedding to compare against.
        def find_related(post)
          config = Jekyll::EmbeddingsGenerator.config
          post_uid = post.data[config["uid"]]
          embedding = query_embeddings(post_uid)
          # Without a stored vector the similarity query cannot be built.
          return [] if embedding.nil?

          find_related_posts(embedding, post_uid)
        end

        private

        # Headers every Supabase request needs, plus any request-specific ones.
        def supabase_headers(extra = {})
          supabase_key = Jekyll::EmbeddingsGenerator.config["supabase_key"]
          {
            "apikey"        => supabase_key,
            "Authorization" => "Bearer #{supabase_key}",
            "Content-Type"  => "application/json",
          }.merge(extra)
        end

        # Inserts the row for +data+, or merges it into the existing row with
        # the same uid. Raises RuntimeError on a non-success status.
        def update_embedding(data)
          config = Jekyll::EmbeddingsGenerator.config

          response = HTTParty.post(
            "#{config["supabase_url"]}/rest/v1/page_embeddings",
            # merge-duplicates gives upsert behavior
            :headers => supabase_headers({ "Prefer" => "resolution=merge-duplicates" }),
            :query   => {
              "on_conflict" => "uid", # important: this MUST be declared as unique on database
            },
            :body    => {
              :uid              => data.uid,
              :most_recent_edit => data.most_recent_edit,
              :embedding        => data.embedding,
              :metadata         => data.metadata,
              :content          => data.content,
            }.to_json
          )

          return if response.success?

          raise "Supabase API error: #{response.code} - #{response.body}"
        end

        # Fetches the stored embedding for +post_uid+.
        #
        # Returns the "embedding" value of the first matching row, or nil when
        # no row matches. Raises RuntimeError on a non-success status.
        def query_embeddings(post_uid)
          config = Jekyll::EmbeddingsGenerator.config
          response = HTTParty.get(
            "#{config["supabase_url"]}/rest/v1/page_embeddings",
            # identity encoding: avoid supabase returning gzipped content
            :headers => supabase_headers({ "Accept-Encoding" => "identity" }),
            :query   => {
              "uid" => "eq.#{post_uid}",
            }
          )
          Jekyll.logger.debug "response.parsed_response: #{response.parsed_response}"
          raise "Supabase API error: #{response.code} - #{response.body}" unless response.success?

          response.parsed_response.first&.dig("embedding")
        end

        # Runs the cosine-similarity query through the `related_posts` RPC.
        #
        # embedding - stored vector of the reference post.
        # post_uid  - uid of the reference post, excluded from the results.
        #
        # Returns the parsed RPC response.
        # Raises ArgumentError/TypeError when the configured limit or score
        # threshold is not numeric, RuntimeError on a non-success status.
        def find_related_posts(embedding, post_uid)
          config = Jekyll::EmbeddingsGenerator.config
          # Coerce the numeric settings so only real numbers reach the SQL text.
          score_threshold = Float(config["score_threshold"] || 0.5)
          limit = Integer(config["limit"] || 3)
          # The uid comes from front matter; double any single quote so it
          # cannot terminate the SQL string literal.
          quoted_uid = post_uid.to_s.gsub("'", "''")
          # Query using cosine similarity
          # Note: this MUST be a stored procedure on Supabase, and order of
          # columns in 'select' statement must match the order of the
          # columns defined in the stored procedure.
          query = <<~SQL
            select
              metadata->>'title' as title,
              uid as uid,
              most_recent_edit,
              metadata->>'url' as url,
              metadata->>'date' as date,
              1 - (embedding <=> '#{embedding}') as similarity
            from page_embeddings
            where uid != '#{quoted_uid}'
              and 1 - (embedding <=> '#{embedding}') > #{score_threshold}
            order by embedding <=> '#{embedding}'
            limit #{limit};
          SQL
          response = HTTParty.post(
            "#{config["supabase_url"]}/rest/v1/rpc/related_posts",
            :headers => supabase_headers(
              {
                "Accept-Encoding" => "identity", # this to avoid supabase returning gzipped content
                "Prefer"          => "return=minimal",
              }
            ),
            :body    => {
              :query => query,
            }.to_json
          )
          raise "Supabase API error: #{response.code} - #{response.body}" unless response.success?

          response.parsed_response
        end
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
module Jekyll
  module EmbeddingsGenerator
    # Builds the plugin configuration from the site's "jekyll-ai-related"
    # settings, command-line options and environment variables, and prepares
    # the Jekyll site the plugin works on.
    class Configuration
      include Jekyll::EmbeddingsGenerator

      @config = {}

      # Assembles and validates the plugin configuration.
      #
      # opts - options hash; "drafts" and "future" are read as fallbacks for
      #        the corresponding site settings.
      #
      # Returns the configuration Hash.
      # Raises RuntimeError when a required environment variable is missing
      # or blank.
      def self.init(opts) # rubocop:disable Metrics/AbcSize,Metrics/PerceivedComplexity
        jk_config = Jekyll.configuration({})["jekyll-ai-related"] || {}
        config = {}
        config["uid"] = jk_config["post_unique_field"] || "slug"
        config["mre"] = jk_config["post_updated_field"] || "date"
        config["path"] = jk_config["output_path"] || "related_posts"
        config["drafts"] = jk_config["include_drafts"] || opts["drafts"] || false
        config["future"] = jk_config["include_future"] || opts["future"] || false
        config["limit"] = jk_config["related_posts_limit"] || 3
        config["score_threshold"] = jk_config["related_posts_score_threshold"] || 0.5
        # Credentials come from the environment only, never from _config.yml.
        config["openai_api_key"] = ENV["OPENAI_API_KEY"]
        config["supabase_url"] = ENV["SUPABASE_URL"]
        config["supabase_key"] = ENV["SUPABASE_KEY"]
        @config = config
        validate
        config
      end

      # Creates the site, reads its content and runs its generators.
      #
      # options - site options; "show_drafts" and "future" are overwritten
      #           with the values resolved by init.
      #
      # Returns the prepared Jekyll::Site.
      def self.build(options)
        options["show_drafts"] = @config["drafts"]
        options["future"] = @config["future"]
        Jekyll.logger.debug "Show drafts? #{options["show_drafts"]}"
        Jekyll.logger.debug "Include future posts? #{options["future"]}"
        site = Jekyll::Site.new(options)
        site.reset
        site.read
        # call the 'generate' method on all plugins inheriting from Jekyll::Generator.
        # This allows to generate the site's content, including any additional data
        # you may have added to the post objects via a custom plugin (which by default
        # lives in the _plugins dir of your Jekyll installation).
        site.generate
        site
      end

      # Ensures the required credentials are present. An empty or
      # whitespace-only value is truthy in Ruby, so it is checked explicitly
      # instead of being accepted and failing later at the remote API.
      def self.validate
        raise "Missing OpenAI API key" if blank?(@config["openai_api_key"])
        raise "Missing Supabase URL" if blank?(@config["supabase_url"])
        raise "Missing Supabase key" if blank?(@config["supabase_key"])
      end

      # True when +value+ is nil or contains nothing but whitespace.
      def self.blank?(value)
        value.nil? || value.to_s.strip.empty?
      end
      private_class_method :blank?
    end
  end
end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "metadata"
4
+
5
module Jekyll
  module EmbeddingsGenerator
    # One record to be stored for a post: its identifier, last edit marker,
    # embedding, metadata hash and raw content.
    class Data
      include Jekyll::EmbeddingsGenerator

      attr_reader :uid, :most_recent_edit, :embedding, :metadata, :content

      # post      - object exposing a data hash and content.
      # embedding - embedding value to store for the post.
      # metadata  - object converted with to_h and stored as-is.
      #
      # The front matter keys used for uid and most_recent_edit are taken from
      # the plugin configuration ("uid" and "mre").
      def initialize(post, embedding, metadata)
        settings = Jekyll::EmbeddingsGenerator.config
        front_matter = post.data

        @uid              = front_matter[settings["uid"]]
        @most_recent_edit = front_matter[settings["mre"]]
        @embedding        = embedding
        @metadata         = metadata.to_h
        @content          = post.content
      end

      # Returns the record as a symbol-keyed Hash, leaving out nil values.
      def to_h
        fields = {
          :uid              => @uid,
          :most_recent_edit => @most_recent_edit,
          :embedding        => @embedding,
          :metadata         => @metadata,
          :content          => @content,
        }
        fields.reject { |_name, value| value.nil? }
      end
    end
  end
end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
module Jekyll
  module EmbeddingsGenerator
    # Snapshot of the descriptive fields of a post: selected front matter
    # values plus the post URL.
    class Metadata
      attr_reader :title, :subtitle, :description, :date, :slug, :uid,
                  :url, :categories, :tags, :updates, :last_edit

      # post - object exposing a data hash (front matter) and url.
      def initialize(post) # rubocop:disable Metrics/AbcSize
        front_matter = post.data

        @title, @subtitle, @description, @date, @slug, @uid =
          front_matter.values_at("title", "subtitle", "description", "date", "slug", "uid")
        @url = post.url
        # last_edit is read from the "most_recent_edit" front matter key.
        @categories, @tags, @updates, @last_edit =
          front_matter.values_at("categories", "tags", "updates", "most_recent_edit")
      end

      # Returns the metadata as a symbol-keyed Hash, leaving out nil values.
      def to_h
        fields = {
          :title       => @title,
          :subtitle    => @subtitle,
          :description => @description,
          :date        => @date,
          :slug        => @slug,
          :uid         => @uid,
          :url         => @url,
          :categories  => @categories,
          :tags        => @tags,
          :updates     => @updates,
          :last_edit   => @last_edit,
        }
        fields.reject { |_name, value| value.nil? }
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
module Jekyll
  module EmbeddingsGenerator
    # Version string of this gem release.
    VERSION = "0.1.0"
  end
end
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "commands/generator"
4
+ require_relative "embeddings-generator/init"
5
+ require_relative "embeddings-generator/version"
6
+ require_relative "embeddings-generator/embeddings/generate"
7
+ require_relative "embeddings-generator/embeddings/store"
8
+ require_relative "embeddings-generator/models/data"
9
+ require_relative "embeddings-generator/models/metadata"
10
+
11
module Jekyll
  # Entry point of the plugin: embeds every post, stores the result, then
  # writes one data file of related posts per post.
  module EmbeddingsGenerator
    class Error < StandardError; end

    class << self
      attr_reader :config, :site

      # Runs the whole pipeline.
      #
      # options - options hash handed to Configuration.init and
      #           Configuration.build.
      def run(options)
        @config = Configuration.init(options)
        @site = Configuration.build(options)
        extract_content
        write_related_posts
      end

      private

      # Generates an embedding for each post and hands it to the store.
      def extract_content
        Jekyll.logger.info "Embeddings Generator:", "Starting to process markdown files..."
        @site.posts.docs.each do |post|
          Jekyll.logger.info "Embeddings Generator:", "Processing post: #{post.data["title"]}"
          # Gather what is needed for the record: content and metadata
          body = post.content
          post_metadata = Jekyll::EmbeddingsGenerator::Metadata.new(post)

          # Embedding comes from the OpenAI API
          vector = Jekyll::EmbeddingsGenerator::Embeddings.generate_embeddings(body)

          # Persist in Supabase
          record = Jekyll::EmbeddingsGenerator::Data.new(post, vector, post_metadata)
          Jekyll::EmbeddingsGenerator::Store.store_embedding(record)
        end
        Jekyll.logger.info "Embeddings Generator:", "Finished processing markdown files."
      end

      # Looks up related posts for each post and writes them to a data file.
      # A failure on one post is logged and does not stop the others.
      def write_related_posts
        @site.posts.docs.each do |post|
          begin
            matches = Jekyll::EmbeddingsGenerator::Store.find_related(post)
            write_to_file(matches, post)

            # Report how many were found
            Jekyll.logger.info "Related posts:", "Found #{matches.length} related posts for #{post.data[@config["uid"]]}"
          rescue StandardError => e
            Jekyll.logger.error "Related posts:", "Error processing #{post.data["title"]}: #{e.message}"
          end
        end
        Jekyll.logger.info "Related posts:", "Finished writing markdown files."
      end

      # Serializes +data+ as YAML into _data/<configured path>/<uid>.yml under
      # the site source, replacing any existing file. Does nothing when +data+
      # is empty.
      def write_to_file(data, post)
        return if data.empty?

        # Make sure the output directory is there
        output_dir = File.join(@site.source, "_data", @config["path"])
        FileUtils.mkdir_p(output_dir)

        basename = safe_filename(post.data[@config["uid"]].to_s)
        destination = File.join(output_dir, "#{basename}.yml")
        File.write(destination, data.to_yaml, mode: "w")
      end

      # Lowercases +filename+ and replaces every character other than a-z,
      # 0-9, "-" and "_" with "-".
      def safe_filename(filename)
        lowered = filename.downcase
        lowered.gsub(%r![^a-z0-9\-_]!, "-")
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
+ require "jekyll"
2
+
3
+ require_relative "jekyll/processor"
@@ -0,0 +1,6 @@
1
module Jekyll
  module EmbeddingsGenerator
    # Version string of the gem.
    VERSION: String
  end
end
@@ -0,0 +1,38 @@
1
create extension if not exists vector;

-- Create a table to store the embeddings
--
-- Notes: "timestamp with time zone" columns hold absolute instants, so the
-- defaults use plain now(). Wrapping it as timezone('Europe/Rome', now())
-- yields a zone-less local time that is interpreted again in the session
-- time zone when assigned, which shifts the stored instant whenever the
-- session is not running in Europe/Rome.
create table page_embeddings (
  id bigint generated by default as identity primary key,
  uid varchar(255) not null unique,
  most_recent_edit timestamp with time zone default now() not null,
  content text,
  embedding vector(1536),
  metadata jsonb,
  created_at timestamp with time zone default now()
);
15
+
16
-- Similarity search index: ivfflat over the embedding column using the
-- cosine operator class, partitioned into 100 lists.
CREATE INDEX page_embeddings_embedding_idx
  ON page_embeddings
  USING ivfflat (embedding vector_cosine_ops)
  WITH (lists = 100);
21
+
22
-- Postgres function used to run the similarity search.
--
-- It executes the text passed in "query" verbatim and returns its rows, so
-- the column order of that statement must match the result columns below.
-- NOTE(review): because it is SECURITY DEFINER and runs caller-supplied SQL,
-- any role allowed to call it can run arbitrary statements with the owner's
-- privileges. Consider restricting EXECUTE on it, or replacing it with a
-- function that takes typed parameters.
CREATE OR REPLACE FUNCTION related_posts(query text)
RETURNS TABLE (
  title text,
  uid varchar(255),
  most_recent_edit timestamp with time zone,
  url text,
  date text,
  similarity float
)
LANGUAGE plpgsql
SECURITY DEFINER
AS $$
BEGIN
  RETURN QUERY EXECUTE query;
END;
$$;
@@ -0,0 +1,5 @@
1
-- Tear down everything the up migration created. Every statement uses
-- "if exists" so the script can be re-run after a partial teardown.
drop index if exists page_embeddings_embedding_idx;

drop function if exists related_posts;

drop table if exists page_embeddings;