algoliasearch-jekyll 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/lib/push.rb +7 -13
  3. data/lib/record_extractor.rb +184 -0
  4. metadata +2 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f1af115b167749491e0cfc3fbdfca8f7cbd0bfb1
4
- data.tar.gz: 6cdc40cf3148a33400178ba9bb9076095adebd31
3
+ metadata.gz: 3ab982386891076f076e8a420e4f7bce3ae6c4c3
4
+ data.tar.gz: 3ee4daebb86e545421496ee3b7d1badd1e7484ed
5
5
  SHA512:
6
- metadata.gz: 371291f704b4029819eb5dbb59de2e3ac2ac90c8973ae678fb1890824f3c9b4470782b1bfe8b7cbb528a374e80eeb4e274440c53a5561efc5bb3d703d4e19ead
7
- data.tar.gz: c508df8e04d78ae5db324ef678cb27ae46f5e067030a45a373dc399f25eca244263ba3bddf998e35b965fed31fb374d1e565014e6fe7a06631f4e4831811b097
6
+ metadata.gz: 5a52c45e78b71da6d978fc8550c18350002a039b1433c5ece654992a3bcf508eadd7435dea2d977db359c2118f03fcaf2b02c2ef4da05db34cdb9c8c3c7af5e8
7
+ data.tar.gz: 127b1475369e4196e81eaac262da037e41cecfc9e2df38ea49dfa80aeebbe4300cd84865362891808152b8838b81b35cf7533c86809af02909a3af452a5a358a
data/lib/push.rb CHANGED
@@ -41,9 +41,7 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
41
41
 
42
42
  # Exclude files manually excluded from config
43
43
  excluded_files = @config['algolia']['excluded_files']
44
- unless excluded_files.nil?
45
- return false if excluded_files.include?(file.name)
46
- end
44
+ return false if excluded_files && excluded_files.include?(file.name)
47
45
 
48
46
  true
49
47
  end
@@ -60,6 +58,7 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
60
58
 
61
59
  new_items = AlgoliaSearchRecordExtractor.new(file).extract
62
60
  next if new_items.nil?
61
+
63
62
  items += new_items
64
63
  end
65
64
  AlgoliaSearchJekyllPush.push(items)
@@ -96,7 +95,7 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
96
95
  exit 1
97
96
  end
98
97
 
99
- unless @config['algolia']['application_id']
98
+ unless @config['algolia'] && @config['algolia']['application_id']
100
99
  Jekyll.logger.error 'Algolia Error: No application ID defined'
101
100
  Jekyll.logger.warn ' Please set your application id in the '\
102
101
  '_config.yml file, like so:'
@@ -124,13 +123,12 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
124
123
  Jekyll.logger.warn ' https://www.algolia.com/explorer'
125
124
  exit 1
126
125
  end
127
- true
126
+ nil
128
127
  end
129
128
 
130
129
  # Get index settings
131
130
  def configure_index(index)
132
131
  settings = {
133
- typoTolerance: true,
134
132
  distinct: true,
135
133
  attributeForDistinct: 'title',
136
134
  attributesForFaceting: %w(tags type title),
@@ -149,18 +147,14 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
149
147
  css_selector
150
148
  css_selector_parent
151
149
  ),
152
- customRanking: ['desc(posted_at)', 'desc(title_weight)'],
150
+ customRanking: ['desc(posted_at)', 'desc(weight)'],
153
151
  highlightPreTag: '<span class="algolia__result-highlight">',
154
152
  highlightPostTag: '</span>'
155
153
  }
156
154
 
157
155
  # Merge default settings with user custom ones
158
- if @config['algolia'].key?('settings')
159
- custom_settings = {}
160
- @config['algolia']['settings'].each do |key, value|
161
- custom_settings[key.to_sym] = value
162
- end
163
- settings.merge!(custom_settings)
156
+ (@config['algolia']['settings'] || []).each do |key, value|
157
+ settings[key.to_sym] = value
164
158
  end
165
159
 
166
160
  index.set_settings(settings)
@@ -0,0 +1,184 @@
1
+ require 'algoliasearch'
2
+ require 'nokogiri'
3
+ require 'json'
4
+
5
+ # Given an HTML file as input, will return an array of records to index
6
+ class AlgoliaSearchRecordExtractor
7
+ def initialize(file)
8
+ @file = file
9
+ default_config = {
10
+ 'record_css_selector' => 'p'
11
+ }
12
+ @config = default_config.merge(file.site.config['algolia'])
13
+ end
14
+
15
+ # Hook to modify a record after extracting
16
+ def custom_hook_each(item, _node)
17
+ item
18
+ end
19
+
20
+ # Hook to modify all records after extracting
21
+ def custom_hook_all(items)
22
+ items
23
+ end
24
+
25
+ # Returns metadata from the current file
26
+ def metadata
27
+ return metadata_page if @file.is_a?(Jekyll::Page)
28
+ return metadata_post if @file.is_a?(Jekyll::Post)
29
+ {}
30
+ end
31
+
32
+ # Extract a list of tags
33
+ def tags
34
+ return nil unless @file.respond_to? :tags
35
+ # Some plugins will extend the tags from simple strings to full featured
36
+ # objects. We'll simply call .to_s to always have a string
37
+ @file.tags.map(&:to_s)
38
+ end
39
+
40
+ # Extract metadata from a post
41
+ def metadata_post
42
+ {
43
+ type: 'post',
44
+ url: @file.url,
45
+ title: @file.title,
46
+ slug: @file.slug,
47
+ posted_at: @file.date.to_time.to_i,
48
+ tags: tags
49
+ }
50
+ end
51
+
52
+ # Extract metadata from a page
53
+ def metadata_page
54
+ {
55
+ type: 'page',
56
+ url: @file.url,
57
+ title: @file['title'],
58
+ slug: @file.basename
59
+ }
60
+ end
61
+
62
+ # Get the list of all HTML nodes to index
63
+ def html_nodes
64
+ document = Nokogiri::HTML(@file.content)
65
+ document.css(@config['record_css_selector'])
66
+ end
67
+
68
+ # Get the closest heading parent
69
+ def node_heading_parent(node, level = 'h7')
70
+ headings = %w(h1 h2 h3 h4 h5 h6)
71
+
72
+ # If initially called on a heading, we must not accept it but only accept
73
+ # strong headings
74
+ level = node.name if level == 'h7' && headings.include?(node.name)
75
+
76
+ previous = node.previous_element
77
+
78
+ # No previous element, we go up to the parent
79
+ unless previous
80
+ parent = node.parent
81
+ # No more parent, then no heading found
82
+ return nil if parent.name == 'body'
83
+ return node_heading_parent(parent, level)
84
+ end
85
+
86
+ # This is a heading, we return it
87
+ return previous if headings.include?(previous.name) && previous.name < level
88
+
89
+ node_heading_parent(previous, level)
90
+ end
91
+
92
+ # Get all the parent headings of the specified node
93
+ def node_hierarchy(node, memo = { level: 7 })
94
+ previous = node_heading_parent(node)
95
+
96
+ # No previous heading, we can stop the recursion
97
+ unless previous
98
+ memo.delete(:level)
99
+ return memo
100
+ end
101
+
102
+ tag_name = previous.name
103
+ level = tag_name.gsub('h', '').to_i
104
+ content = previous.content
105
+
106
+ # Skip if item already as title of a higher level
107
+ return node_hierarchy(previous, memo) if level >= memo[:level]
108
+ memo[:level] = level
109
+
110
+ # Add to the memo and continue
111
+ memo[tag_name.to_sym] = content
112
+ node_hierarchy(previous, memo)
113
+ end
114
+
115
+ # Return the raw HTML of the element to index
116
+ def node_raw_html(node)
117
+ node.to_s
118
+ end
119
+
120
+ # Return the text of the element, sanitized to be displayed
121
+ def node_text(node)
122
+ node.content.gsub('<', '&lt;').gsub('>', '&gt;')
123
+ end
124
+
125
+ # Returns a unique string of hierarchy from title to h6, used for distinct
126
+ def unique_hierarchy(data)
127
+ headings = %w(title h1 h2 h3 h4 h5 h6)
128
+ headings.map { |heading| data[heading.to_sym] }.compact.join(' > ')
129
+ end
130
+
131
+ # Returns a hash of two CSS selectors. One for the node itself, and one its
132
+ # closest heading parent
133
+ def node_css_selector(node)
134
+ return nil if node.nil?
135
+
136
+ # Use the CSS id if one is set
137
+ return "##{node['id']}" if node['id']
138
+
139
+ # Default Nokogiri selector
140
+ node.css_path.gsub('html > body > ', '')
141
+ end
142
+
143
+ # Returns a custom numeric value representing how relevant to its hierarchy
144
+ # this record is. This value can be used in the custom ranking to display more
145
+ # relevant records first.
146
+ def weight(data)
147
+ # Get list of unique words in headings
148
+ title_words = %i(title h1 h2 h3 h4 h5 h6)
149
+ .select { |title| data.key?(title) }
150
+ .map { |title| data[title].split(/\W+/) }
151
+ .flatten
152
+ .compact
153
+ .map(&:downcase)
154
+ .uniq
155
+ # Intersect words in headings with words in test
156
+ text_words = data[:text].downcase.split(/\W+/)
157
+ (title_words & text_words).size
158
+ end
159
+
160
+ def extract
161
+ items = []
162
+ html_nodes.each_with_index do |node, index|
163
+ next unless node.text.size > 0
164
+
165
+ item = metadata.clone
166
+ item[:objectID] = "#{item[:slug]}_#{index}"
167
+ item.merge!(node_hierarchy(node))
168
+ item[:tag_name] = node.name
169
+ item[:raw_html] = node_raw_html(node)
170
+ item[:text] = node_text(node)
171
+ item[:unique_hierarchy] = unique_hierarchy(item)
172
+ item[:css_selector] = node_css_selector(node)
173
+ item[:css_selector_parent] = node_css_selector(node_heading_parent(node))
174
+ item[:weight] = weight(item)
175
+
176
+ # We pass item through the user defined custom hook
177
+ item = custom_hook_each(item, node)
178
+ next if item.nil?
179
+
180
+ items << item
181
+ end
182
+ custom_hook_all(items)
183
+ end
184
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: algoliasearch-jekyll
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tim Carry
@@ -117,6 +117,7 @@ extra_rdoc_files: []
117
117
  files:
118
118
  - lib/algoliasearch-jekyll.rb
119
119
  - lib/push.rb
120
+ - lib/record_extractor.rb
120
121
  homepage: https://github.com/algolia/algoliasearch-jekyll
121
122
  licenses:
122
123
  - MIT