perron 0.17.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/Gemfile +1 -1
  3. data/Gemfile.lock +25 -2
  4. data/app/controllers/perron/searches_controller.rb +48 -0
  5. data/app/helpers/perron/feeds_helper.rb +7 -0
  6. data/app/helpers/perron/markdown_helper.rb +3 -3
  7. data/app/helpers/perron/meta_tags_helper.rb +17 -0
  8. data/bin/release +19 -4
  9. data/lib/generators/perron/templates/README.md.tt +31 -7
  10. data/lib/generators/perron/templates/initializer.rb.tt +8 -4
  11. data/lib/generators/rails/content/USAGE +28 -26
  12. data/lib/generators/rails/content/content_generator.rb +6 -7
  13. data/lib/generators/rails/content/templates/controller.rb.tt +1 -5
  14. data/lib/generators/rails/content/templates/model.rb.tt +3 -3
  15. data/lib/perron/collection.rb +10 -1
  16. data/lib/perron/configuration.rb +9 -4
  17. data/lib/perron/content/data.rb +6 -2
  18. data/lib/perron/data_source/class_methods.rb +58 -0
  19. data/lib/perron/data_source/helper_context.rb +20 -0
  20. data/lib/perron/data_source/item.rb +37 -0
  21. data/lib/perron/{data → data_source}/proxy.rb +1 -1
  22. data/lib/perron/data_source.rb +155 -0
  23. data/lib/perron/engine.rb +12 -0
  24. data/lib/perron/html_processor/syntax_highlight.rb +2 -0
  25. data/lib/perron/output_server.rb +7 -2
  26. data/lib/perron/relation.rb +51 -0
  27. data/lib/perron/resource/associations.rb +2 -2
  28. data/lib/perron/resource/class_methods.rb +10 -0
  29. data/lib/perron/resource/configuration.rb +8 -11
  30. data/lib/perron/resource/core.rb +11 -0
  31. data/lib/perron/resource/related/stop_words.rb +20 -20
  32. data/lib/perron/resource/related.rb +73 -52
  33. data/lib/perron/resource/scopes.rb +29 -0
  34. data/lib/perron/resource/searchable.rb +19 -0
  35. data/lib/perron/resource/sourceable.rb +2 -2
  36. data/lib/perron/resource/sweeper.rb +45 -0
  37. data/lib/perron/resource/table_of_content.rb +0 -18
  38. data/lib/perron/resource.rb +30 -20
  39. data/lib/perron/site.rb +3 -3
  40. data/lib/perron/tasks/build.rake +8 -1
  41. data/lib/perron/version.rb +1 -1
  42. data/lib/perron.rb +1 -0
  43. data/perron.gemspec +1 -0
  44. metadata +28 -6
  45. data/app/helpers/feeds_helper.rb +0 -5
  46. data/app/helpers/meta_tags_helper.rb +0 -15
  47. data/lib/perron/data.rb +0 -180
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
module Perron
  class DataSource < SimpleDelegator
    # A single record of a data source.
    #
    # Wraps an attribute hash and exposes every attribute through `item[:key]`
    # (string or symbol key) and through a reader method of the same name.
    class Item
      # @param attributes [Hash] raw attributes of the record; keys are
      #   normalised to symbols
      # @param identifier [#to_s] identifier of the data source the record
      #   belongs to, used to derive the partial path
      def initialize(attributes, identifier:)
        # `to_s.to_sym` instead of `to_sym`: keys that are neither String nor
        # Symbol (e.g. Integer or nil) do not respond to `to_sym`.
        @attributes = attributes.transform_keys { |key| key.to_s.to_sym }
        @identifier = identifier
      end

      # Returns the attribute stored under +key+, or nil when it is absent.
      def [](key) = @attributes[key.to_s.to_sym]

      # Returns the attribute stored under +key+; same lookup as `#[]`.
      def association_value(key) = self[key]

      # Builds (and memoizes) a path of the form
      # "content/<collection>/<element>" from the identifier: the collection is
      # the identifier without directory and extension when it has an
      # extension, otherwise the identifier itself; the element is the
      # singularized, underscored basename of the collection.
      def to_partial_path
        @to_partial_path ||= begin
          identifier = @identifier.to_s
          collection = File.extname(identifier).present? ? File.basename(identifier, ".*") : identifier
          element = ActiveSupport::Inflector.underscore(ActiveSupport::Inflector.singularize(File.basename(collection)))

          File.join("content", collection, element)
        end
      end

      # Serves attribute readers: a call without arguments and without a block
      # whose name matches an attribute returns that attribute. Everything
      # else falls through to the default behaviour (NoMethodError).
      def method_missing(method_name, *arguments, &block)
        return super if !@attributes.key?(method_name) || arguments.any? || block

        @attributes[method_name]
      end

      # Keeps `respond_to?` in line with the readers served by `method_missing`.
      def respond_to_missing?(method_name, include_private = false)
        @attributes.key?(method_name) || super
      end
    end
    private_constant :Item
  end
end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Perron
4
- class Data
4
+ class DataSource
5
5
  class Proxy
6
6
  include Enumerable
7
7
 
@@ -0,0 +1,155 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+
5
+ require "perron/data_source/class_methods"
6
+ require "perron/data_source/item"
7
+ require "perron/data_source/helper_context"
8
+
module Perron
  # An enumerable, array-backed view of a data file.
  #
  # The file is read, rendered through ERB and then parsed according to its
  # extension (YAML, JSON or CSV). Every entry is wrapped in an Item; the
  # resulting array is both kept in @records and used as the delegation target.
  class DataSource < SimpleDelegator
    include Enumerable

    include Perron::DataSource::ClassMethods

    # Maps a file extension to the private method that parses that format.
    # (Defined ahead of `private`, which has no effect on constants.)
    PARSER_METHODS = {
      ".yml" => :parse_yaml, ".yaml" => :parse_yaml,
      ".json" => :parse_json, ".csv" => :parse_csv
    }.freeze
    SUPPORTED_EXTENSIONS = PARSER_METHODS.keys.freeze

    # @param identifier identifier of the data source; it is resolved to a file
    #   path through the class-level `path_for!` (defined outside this file —
    #   NOTE(review): presumably raises when no matching file exists; confirm).
    # @raise [Errors::DataParseError] when rendering or parsing fails, or the
    #   parsed data is not an array of hashes
    # @raise [Errors::UnsupportedDataFormatError] for an unknown file extension
    def initialize(identifier)
      @identifier = identifier
      @file_path = self.class.path_for!(identifier)
      @records = records

      # Hand the already-built array to the delegator instead of calling
      # `records` again, which would read, render and parse the file twice.
      super(@records)
    end

    def each(&block) = @records.each(&block)

    def count = @records.count

    # Returns the first record, or the first +n+ records when +n+ is given.
    def first(n = nil)
      n ? @records.first(n) : @records.first
    end

    def last = @records.last

    def [](index) = @records[index]

    def size = @records.size
    alias_method :length, :size

    private

    # Reads, renders and parses the data file and wraps each entry in an Item.
    #
    # @return [Array<Item>]
    # @raise [Errors::DataParseError] when the top-level value is not an Array
    #   or one of its entries is not a Hash
    def records
      content = rendered_from(@file_path)
      data = parsed_from(content, @file_path)

      unless data.is_a?(Array)
        raise Errors::DataParseError, "Data in `#{@file_path}` must be an array of objects."
      end

      data.map.with_index do |item, index|
        unless item.is_a?(Hash)
          raise Errors::DataParseError, "Item at index #{index} in `#{@file_path}` must be a hash/object, got #{item.class}"
        end

        Item.new(item, identifier: @identifier)
      end
    end

    # Reads the file at +path+ and evaluates it as an ERB template.
    #
    # @raise [Errors::DataParseError] when ERB evaluation raises NameError,
    #   ArgumentError or SyntaxError
    def rendered_from(path)
      raw_content = File.read(path)

      render_erb(raw_content)
    rescue NameError, ArgumentError, SyntaxError => error
      raise Errors::DataParseError, "Failed to render ERB in `#{path}`: (#{error.class}) #{error.message}"
    end

    # Dispatches +content+ to the parser registered for the extension of +path+.
    #
    # @raise [Errors::UnsupportedDataFormatError] when the extension is not in
    #   PARSER_METHODS
    def parsed_from(content, path)
      extension = File.extname(path)
      parser_method = PARSER_METHODS.fetch(extension) do
        raise Errors::UnsupportedDataFormatError, "Unsupported data format: #{extension}. Supported formats: #{SUPPORTED_EXTENSIONS.join(", ")}"
      end

      send(parser_method, content, path)
    end

    # Evaluates +content+ as ERB against the binding provided by HelperContext.
    def render_erb(content) = ERB.new(content).result(HelperContext.instance.get_binding)

    # Parses YAML, permitting Symbol and Time values as well as aliases.
    # Syntax errors are re-raised as DataParseError with line/column details
    # when the parser supplies them.
    def parse_yaml(content, path)
      YAML.safe_load(content, permitted_classes: [Symbol, Time], aliases: true)
    rescue Psych::SyntaxError => error
      line_info = error.line ? " at line #{error.line}" : ""
      column_info = error.column ? ", column #{error.column}" : ""

      raise Errors::DataParseError, "Invalid YAML syntax in `#{path}`#{line_info}#{column_info}: #{error.problem}"
    end

    # Parses JSON with symbolized keys. Parser errors are re-raised as
    # DataParseError; a line number is added when the parser message has one.
    def parse_json(content, path)
      JSON.parse(content, symbolize_names: true)
    rescue JSON::ParserError => error
      line_match = error.message.match(/at line (\d+)/)
      line_info = line_match ? " at line #{line_match[1]}" : ""

      raise Errors::DataParseError, "Invalid JSON syntax in `#{path}`#{line_info}: #{error.message}"
    end

    # Parses CSV with a header row into an array of hashes keyed by symbolized
    # header. The headers of the first data row serve as the reference; a row
    # reporting different headers raises DataParseError.
    def parse_csv(content, path)
      expected_headers = nil

      CSV.new(content, headers: true, header_converters: :symbol).map.with_index do |row, index|
        expected_headers ||= row.headers

        if row.headers != expected_headers
          missing = expected_headers - row.headers
          extra = row.headers - expected_headers

          error_parts = []
          error_parts << "missing columns: #{missing.join(", ")}" if missing.any?
          error_parts << "extra columns: #{extra.join(", ")}" if extra.any?

          # index is 0-based over data rows and the header occupies row 1,
          # hence the offset of 2 for the reported row number.
          raise Errors::DataParseError, "Column mismatch in `#{path}` at row #{index + 2} (#{error_parts.join("; ")}). Expected: #{expected_headers.join(", ")}"
        end

        row.to_h
      end
    rescue CSV::MalformedCSVError => error
      line_match = error.message.match(/line (\d+)/)
      line_info = line_match ? " at line #{line_match[1]}" : ""

      raise Errors::DataParseError, "Malformed CSV in `#{path}`#{line_info}: #{error.message}"
    end
  end
end
data/lib/perron/engine.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "perron/output_server"
4
+ require "mata"
4
5
 
5
6
  module Perron
6
7
  class Engine < Rails::Engine
@@ -12,6 +13,17 @@ module Perron
12
13
  app.middleware.use Perron::OutputServer
13
14
  end
14
15
 
16
+ initializer "perron.configure_hmr", after: :load_config_initializers do |app|
17
+ if Rails.env.development? && Perron.configuration.live_reload
18
+ app.config.middleware.insert_before(
19
+ ActionDispatch::Static,
20
+ Mata,
21
+ watch: Perron.configuration.live_reload_watch_paths,
22
+ skip: Perron.configuration.live_reload_skip_paths
23
+ )
24
+ end
25
+ end
26
+
15
27
  rake_tasks do
16
28
  load File.expand_path("../tasks/build.rake", __FILE__)
17
29
  load File.expand_path("../tasks/clobber.rake", __FILE__)
@@ -7,6 +7,8 @@ module Perron
7
7
  class HtmlProcessor
8
8
  class SyntaxHighlight < HtmlProcessor::Base
9
9
  def process
10
+ Perron.deprecator.deprecation_warning(:syntax_highlight)
11
+
10
12
  @html.css('pre > code[class*="language-"]').each do |code_block|
11
13
  language = code_block[:class][/(?<=language-)\S+/]
12
14
 
@@ -27,21 +27,26 @@ module Perron
27
27
 
28
28
  def serve(file_path)
29
29
  content = File.read(file_path)
30
+ injected_content = inject_preview_indicator(content)
30
31
 
31
32
  [
32
33
  200,
33
34
 
34
35
  {
35
36
  "Content-Type" => "text/html; charset=utf-8",
36
- "Content-Length" => content.bytesize.to_s
37
+ "Content-Length" => injected_content.bytesize.to_s
37
38
  },
38
39
 
39
- [content]
40
+ [injected_content]
40
41
  ]
41
42
  end
42
43
 
43
44
  def enabled? = Dir.exist?(output_path)
44
45
 
# Marks a served page as a preview by prefixing its document title with
# "[PREVIEW] ".
#
# Only the first <title> element is rewritten, so later <title> elements
# (for example inside inline SVG) are left untouched. The match is
# case-insensitive and may span several lines. Content without a <title>
# element is returned unchanged.
#
# @param content [String] HTML of the page
# @return [String] a new string; +content+ itself is not modified
def inject_preview_indicator(content)
  content.sub(/<title>(.*?)<\/title>/im, "<title>[PREVIEW] \\1</title>")
end
49
+
45
50
  def output_path
46
51
  @output_path ||= Rails.root.join(Perron.configuration.output)
47
52
  end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
module Perron
  # An Array of resources with a small chainable query interface.
  #
  # Every query method returns a new Relation (or, for `pluck`, a plain
  # Array), so calls can be chained: `relation.where(...).order(...).limit(3)`.
  class Relation < Array
    # @param resources [Array] the resources to wrap
    def initialize(resources = [])
      super
    end

    # Keeps the resources matching every condition. Values are compared by
    # their string representation; an Array value matches when it contains
    # the attribute's value.
    #
    # @param conditions [Hash{Symbol => Object, Array}] attribute => expected
    # @return [Relation]
    def where(**conditions)
      matches = select do |resource|
        conditions.all? do |attribute, expected|
          actual = resource.public_send(attribute).to_s

          if expected.is_a?(Array)
            expected.map(&:to_s).include?(actual)
          else
            actual == expected.to_s
          end
        end
      end

      Relation.new(matches)
    end

    # @return [Relation] the first +count+ resources
    def limit(count) = Relation.new(first(count))

    # @return [Relation] the resources after skipping the first +count+
    def offset(count) = Relation.new(drop(count))

    # Sorts by +attribute+. Accepts `order(:title, :desc)` as well as the
    # hash form `order(title: :desc)`. The direction is matched
    # case-insensitively as Symbol or String; anything other than "desc"
    # sorts ascending.
    #
    # @return [Relation]
    def order(attribute, direction = :asc)
      attribute, direction = attribute.first if attribute.is_a?(Hash)

      sorted = sort_by { |resource| resource.public_send(attribute) }
      descending = direction.to_s.casecmp?("desc")

      Relation.new(descending ? sorted.reverse : sorted)
    end

    # Collects attribute values: a flat Array for a single attribute, an
    # Array of Arrays (one per resource) for several attributes.
    #
    # @raise [ArgumentError] when no attribute is given
    # @return [Array]
    def pluck(*attributes)
      raise ArgumentError, "wrong number of arguments (given 0, expected 1+)" if attributes.empty?

      if attributes.size == 1
        attribute = attributes.first
        return map { |resource| resource.public_send(attribute) }
      end

      map { |resource| attributes.map { |attribute| resource.public_send(attribute) } }
    end
  end
end
@@ -66,7 +66,7 @@ module Perron
66
66
  def records_for_ids(associated_class, ids)
67
67
  ids = Array(ids)
68
68
 
69
- associated_class.all.select { ids.include?(it[:id]) || ids.include?(it["id"]) }
69
+ Perron::Relation.new(associated_class.all.select { ids.include?(it[:id]) || ids.include?(it["id"]) })
70
70
  end
71
71
 
72
72
  def records_for_foreign_key(associated_class, association_name, **options)
@@ -74,7 +74,7 @@ module Perron
74
74
  primary_key_method = options.fetch(:primary_key, :slug)
75
75
  lookup_value = public_send(primary_key_method)
76
76
 
77
- associated_class.all.select { it.association_value(foreign_key) == lookup_value }
77
+ Perron::Relation.new(associated_class.all.select { it.association_value(foreign_key) == lookup_value })
78
78
  end
79
79
 
80
80
  def inverse_association_name = self.class.name.demodulize.underscore
@@ -8,10 +8,20 @@ module Perron
8
8
  class_methods do
9
9
  def find(slug) = collection.find(slug, name.constantize)
10
10
 
11
+ def find!(slug) = collection.find!(slug, name.constantize)
12
+
11
13
  def all = collection.all(self)
12
14
 
15
+ def where(**conditions) = all.where(**conditions)
16
+
17
+ def limit(count) = all.limit(count)
18
+
19
+ def offset(count) = all.offset(count)
20
+
13
21
  def count = all.size
14
22
 
23
+ def order(attribute, direction = :asc) = all.order(attribute, direction)
24
+
15
25
  def first(n = nil)
16
26
  n ? all.first(n) : all[0]
17
27
  end
@@ -22,8 +22,6 @@ module Perron
22
22
  config.feeds.json.path = "feeds/#{collection.name.demodulize.parameterize}.json"
23
23
  config.feeds.json.max_items = 20
24
24
 
25
- config.linked_data = ActiveSupport::OrderedOptions.new
26
-
27
25
  config.related_posts = ActiveSupport::OrderedOptions.new
28
26
  config.related_posts.enabled = false
29
27
  config.related_posts.max = 5
@@ -39,18 +37,17 @@ module Perron
39
37
  end
40
38
 
41
39
  class Options < ActiveSupport::OrderedOptions
42
- def method_missing(name, *arguments)
43
- if name.to_s.end_with?("=")
44
- key = name.to_s.chomp("=").to_sym
45
- value = arguments.first
46
-
47
- return self[key].merge!(value) if self[key].is_a?(ActiveSupport::OrderedOptions) && value.is_a?(Hash)
40
+ def []=(key, value)
41
+ if self[key].is_a?(ActiveSupport::OrderedOptions) && value.is_a?(Hash)
42
+ self[key].merge!(value)
43
+ else
44
+ super
48
45
  end
49
-
50
- super
51
46
  end
52
47
 
53
- def respond_to_missing?(name, include_private = false) = super
48
+ def respond_to_missing?(name, include_private = false)
49
+ name.to_s.end_with?("=") || super
50
+ end
54
51
  end
55
52
  private_constant :Options
56
53
  end
@@ -10,6 +10,17 @@ module Perron
10
10
  def to_model = self
11
11
 
12
12
  def model_name = self.class.model_name
13
+
14
+ def association_value(key) = metadata[key]
15
+
16
+ def to_partial_path
17
+ @to_partial_path ||= begin
18
+ element = ActiveSupport::Inflector.underscore(ActiveSupport::Inflector.demodulize(self.class.model_name))
19
+ collection = ActiveSupport::Inflector.tableize(self.class.model_name)
20
+
21
+ File.join("content", collection, element)
22
+ end
23
+ end
13
24
  end
14
25
  end
15
26
  end
@@ -5,28 +5,28 @@ module Perron
5
5
  class Resource
6
6
  class Related
7
7
  module StopWords
8
+ ALL = Set[
9
+ "a", "about", "above", "after", "again", "against", "all", "am",
10
+ "an", "and", "any", "are", "as", "at", "be", "because", "been",
11
+ "before", "being", "below", "between", "both", "but", "by", "can",
12
+ "did", "do", "does", "doing", "down", "during", "each", "few",
13
+ "for", "from", "further", "had", "has", "have", "having", "he",
14
+ "her", "here", "hers", "herself", "him", "himself", "his", "how",
15
+ "i", "if", "in", "into", "is", "it", "its", "itself", "just",
16
+ "me", "more", "most", "my", "myself", "no", "nor", "not", "now",
17
+ "of", "off", "on", "once", "only", "or", "other", "our", "ours",
18
+ "ourselves", "out", "over", "own", "s", "same", "she", "should",
19
+ "so", "some", "such", "t", "than", "that", "the", "their",
20
+ "theirs", "them", "themselves", "then", "there", "these", "they",
21
+ "this", "those", "through", "to", "too", "under", "until", "up",
22
+ "very", "was", "we", "were", "what", "when", "where", "which",
23
+ "while", "who", "whom", "why", "will", "with", "you", "your",
24
+ "yours", "yourself", "yourselves"
25
+ ].freeze
26
+
8
27
  module_function
9
28
 
10
- def all
11
- Set[
12
- "a", "about", "above", "after", "again", "against", "all", "am",
13
- "an", "and", "any", "are", "as", "at", "be", "because", "been",
14
- "before", "being", "below", "between", "both", "but", "by", "can",
15
- "did", "do", "does", "doing", "down", "during", "each", "few",
16
- "for", "from", "further", "had", "has", "have", "having", "he",
17
- "her", "here", "hers", "herself", "him", "himself", "his", "how",
18
- "i", "if", "in", "into", "is", "it", "its", "itself", "just",
19
- "me", "more", "most", "my", "myself", "no", "nor", "not", "now",
20
- "of", "off", "on", "once", "only", "or", "other", "our", "ours",
21
- "ourselves", "out", "over", "own", "s", "same", "she", "should",
22
- "so", "some", "such", "t", "than", "that", "the", "their",
23
- "theirs", "them", "themselves", "then", "there", "these", "they",
24
- "this", "those", "through", "to", "too", "under", "until", "up",
25
- "very", "was", "we", "were", "what", "when", "where", "which",
26
- "while", "who", "whom", "why", "will", "with", "you", "your",
27
- "yours", "yourself", "yourselves"
28
- ]
29
- end
29
+ def all = ALL
30
30
  end
31
31
  end
32
32
  end
@@ -5,93 +5,114 @@ require "perron/resource/related/stop_words"
5
5
  module Perron
6
6
  module Site
7
7
  class Resource
8
+ # Finds related resources using TF-IDF cosine similarity.
9
+ #
10
+ # Pre-normalizes vectors so cosine similarity reduces to a dot product,
11
+ # then builds a symmetric similarity matrix once per collection.
12
+ # Results are cached at the class level so the O(n²) comparison
13
+ # is paid once, not once per resource.
8
14
  class Related
15
+ Cache = Struct.new(:resources, :similarity_matrix, :fingerprint)
16
+
17
+ @collection_caches = {}
18
+
19
+ def self.cache_for(collection_name)
20
+ clear_cache!(collection_name) if stale?(collection_name)
21
+ @collection_caches[collection_name] ||= Cache.new(nil, nil, content_fingerprint(collection_name))
22
+ end
23
+
24
+ def self.clear_cache!(collection_name)
25
+ @collection_caches.delete(collection_name)
26
+ end
27
+
28
+ def self.stale?(collection_name)
29
+ @collection_caches[collection_name]&.fingerprint != content_fingerprint(collection_name)
30
+ end
31
+
32
+ def self.content_fingerprint(collection_name)
33
+ path = File.join(Perron.configuration.input, collection_name)
34
+ files = Dir.glob(File.join(path, "**", "*.*"))
35
+ [files.size, files.map { File.mtime(it) }.max]
36
+ end
37
+
9
38
  def initialize(resource)
10
39
  @resource = resource
11
40
  @collection = resource.collection
41
+ @cache = self.class.cache_for(@collection.name)
12
42
  end
13
43
 
14
44
  def find(limit: 5)
15
- @collection.resources
45
+ scores = similarity_matrix[@resource.slug] || {}
46
+
47
+ resources
16
48
  .reject { it.slug == @resource.slug }
17
- .map { [it, cosine_similarities_for(@resource, it)] }
18
- .sort_by { |_, score| -score }
19
- .map(&:first)
49
+ .sort_by { -(scores[it.slug] || 0.0) }
20
50
  .first(limit)
21
51
  end
22
52
 
23
53
  private
24
54
 
25
- def cosine_similarities_for(resource_one, resource_two)
26
- first_vector = tfidf_vector_for(resource_one)
27
- second_vector = tfidf_vector_for(resource_two)
55
+ def resources = @cache.resources ||= @collection.resources
28
56
 
29
- return 0.0 if first_vector.empty? || second_vector.empty?
57
+ def similarity_matrix = @cache.similarity_matrix ||= build_similarity_matrix
30
58
 
31
- dot_product = 0.0
59
+ def build_similarity_matrix
60
+ vectors = resources.to_h { [it.slug, normalize(tfidf_vector_for(it))] }
61
+ matrix = Hash.new { |h, k| h[k] = {} }
32
62
 
33
- first_vector.each_key { dot_product += first_vector[it] * second_vector[it] if second_vector.key?(it) }
63
+ slugs = vectors.keys
64
+ slugs.each_with_index do |slug_a, i|
65
+ next if vectors[slug_a].empty?
34
66
 
35
- first_magnitude = Math.sqrt(first_vector.values.sum { it**2 })
36
- second_magnitude = Math.sqrt(second_vector.values.sum { it**2 })
37
- denominator = first_magnitude * second_magnitude
67
+ slugs[(i + 1)..].each do |slug_b|
68
+ next if vectors[slug_b].empty?
38
69
 
39
- return 0.0 if denominator.zero?
70
+ score = dot_product(vectors[slug_a], vectors[slug_b])
71
+ matrix[slug_a][slug_b] = score
72
+ matrix[slug_b][slug_a] = score
73
+ end
74
+ end
40
75
 
41
- dot_product / denominator
76
+ matrix
42
77
  end
43
78
 
44
- def tfidf_vector_for(target_resource)
45
- @tfidf_vectors ||= {}
46
-
47
- return @tfidf_vectors[target_resource] if @tfidf_vectors.key?(target_resource)
48
-
49
- tokens = tokenize_content(target_resource)
50
- token_count = tokens.size
51
-
52
- return {} if token_count.zero?
79
+ def dot_product(vec_a, vec_b)
80
+ score = 0.0
81
+ vec_a.each_key { score += vec_a[it] * vec_b[it] if vec_b.key?(it) }
82
+ score
83
+ end
53
84
 
54
- term_count = Hash.new(0)
85
+ def normalize(vector)
86
+ return {} if vector.empty?
55
87
 
56
- tokens.each { |token| term_count[token] += 1 }
88
+ magnitude = Math.sqrt(vector.values.sum { it**2 })
89
+ return {} if magnitude.zero?
57
90
 
58
- tfidf_vector = {}
91
+ vector.transform_values { it / magnitude }
92
+ end
59
93
 
60
- term_count.each do |term, count|
61
- terms = count.to_f / token_count
94
+ def tfidf_vector_for(resource)
95
+ tokens = tokenize(resource)
96
+ return {} if tokens.empty?
62
97
 
63
- tfidf_vector[term] = terms * inverse_document_frequency[term]
64
- end
98
+ token_count = tokens.size.to_f
65
99
 
66
- @tfidf_vectors[target_resource] = tfidf_vector
100
+ tokens.tally.to_h { |term, count| [term, (count / token_count) * inverse_document_frequency[term]] }
67
101
  end
68
102
 
69
- def tokenize_content(target_resource)
70
- @tokenized_content ||= {}
103
+ def tokenize(resource)
104
+ return [] if resource.content.blank?
71
105
 
72
- return @tokenized_content[target_resource] if @tokenized_content.key?(target_resource)
73
- return [] if target_resource.content.blank?
74
-
75
- content = target_resource.content.gsub(/<[^>]*>/, " ")
76
- tokens = content.downcase.scan(/\w+/).reject { StopWords.all.include?(it) || it.length < 3 }
77
-
78
- @tokenized_content[target_resource] = tokens
106
+ resource.content.gsub(/<[^>]*>/, " ").downcase.scan(/\w+/).reject { StopWords.all.include?(it) || it.length < 3 }
79
107
  end
80
108
 
81
109
  def inverse_document_frequency
82
110
  @inverse_document_frequency ||= begin
83
- resource_frequency = Hash.new(0)
84
-
85
- @collection.resources.each { tokenize_content(it).uniq.each { resource_frequency[it] += 1 } }
86
-
87
- frequencies = {}
88
- total_resources = @collection.resources.size
89
-
90
- resource_frequency.each do |term, frequency|
91
- frequencies[term] = Math.log(total_resources.to_f / (1 + frequency))
92
- end
111
+ doc_frequency = Hash.new(0)
112
+ resources.each { tokenize(it).uniq.each { doc_frequency[it] += 1 } }
93
113
 
94
- frequencies
114
+ total = resources.size.to_f
115
+ doc_frequency.transform_values { Math.log(total / (1 + it)) }
95
116
  end
96
117
  end
97
118
  end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
module Perron
  class Resource
    # Adds a `scope` macro that defines named, reusable queries.
    module Scopes
      extend ActiveSupport::Concern

      class_methods do
        # Defines a scope called +name+ on the class itself and on
        # Perron::Relation, so a scope can start a query and be chained onto
        # an existing relation. The body is evaluated with `instance_exec`,
        # i.e. with the class or the relation as `self`.
        #
        # NOTE(review): the method is defined on Perron::Relation itself, so
        # it is visible on every relation regardless of the resource class
        # that declared it; a later scope with the same name replaces it.
        #
        # @param name [Symbol, String] name of the scope
        # @param body [#call] callable holding the query; it is passed as a
        #   block, so in practice it has to be a Proc/lambda
        # @raise [ArgumentError] when +body+ is not callable or the class
        #   already responds to +name+ (private methods included)
        def scope(name, body)
          unless body.respond_to?(:call)
            raise ArgumentError, "The scope body needs to be callable."
          end

          if respond_to?(name, true)
            raise ArgumentError, "Cannot define scope :#{name} because it already exists."
          end

          # Keywords are captured and forwarded separately: with `*arguments`
          # alone they would arrive as a positional Hash and a body declaring
          # keyword parameters would raise ArgumentError on Ruby 3.
          singleton_class.define_method(name) do |*arguments, **options|
            instance_exec(*arguments, **options, &body)
          end

          Perron::Relation.define_method(name) do |*arguments, **options|
            instance_exec(*arguments, **options, &body)
          end
        end
      end
    end
  end
end