algoliasearch-jekyll 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. checksums.yaml +4 -4
  2. data/lib/push.rb +7 -13
  3. data/lib/record_extractor.rb +184 -0
  4. metadata +2 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f1af115b167749491e0cfc3fbdfca8f7cbd0bfb1
4
- data.tar.gz: 6cdc40cf3148a33400178ba9bb9076095adebd31
3
+ metadata.gz: 3ab982386891076f076e8a420e4f7bce3ae6c4c3
4
+ data.tar.gz: 3ee4daebb86e545421496ee3b7d1badd1e7484ed
5
5
  SHA512:
6
- metadata.gz: 371291f704b4029819eb5dbb59de2e3ac2ac90c8973ae678fb1890824f3c9b4470782b1bfe8b7cbb528a374e80eeb4e274440c53a5561efc5bb3d703d4e19ead
7
- data.tar.gz: c508df8e04d78ae5db324ef678cb27ae46f5e067030a45a373dc399f25eca244263ba3bddf998e35b965fed31fb374d1e565014e6fe7a06631f4e4831811b097
6
+ metadata.gz: 5a52c45e78b71da6d978fc8550c18350002a039b1433c5ece654992a3bcf508eadd7435dea2d977db359c2118f03fcaf2b02c2ef4da05db34cdb9c8c3c7af5e8
7
+ data.tar.gz: 127b1475369e4196e81eaac262da037e41cecfc9e2df38ea49dfa80aeebbe4300cd84865362891808152b8838b81b35cf7533c86809af02909a3af452a5a358a
data/lib/push.rb CHANGED
@@ -41,9 +41,7 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
41
41
 
42
42
  # Exclude files manually excluded from config
43
43
  excluded_files = @config['algolia']['excluded_files']
44
- unless excluded_files.nil?
45
- return false if excluded_files.include?(file.name)
46
- end
44
+ return false if excluded_files && excluded_files.include?(file.name)
47
45
 
48
46
  true
49
47
  end
@@ -60,6 +58,7 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
60
58
 
61
59
  new_items = AlgoliaSearchRecordExtractor.new(file).extract
62
60
  next if new_items.nil?
61
+
63
62
  items += new_items
64
63
  end
65
64
  AlgoliaSearchJekyllPush.push(items)
@@ -96,7 +95,7 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
96
95
  exit 1
97
96
  end
98
97
 
99
- unless @config['algolia']['application_id']
98
+ unless @config['algolia'] && @config['algolia']['application_id']
100
99
  Jekyll.logger.error 'Algolia Error: No application ID defined'
101
100
  Jekyll.logger.warn ' Please set your application id in the '\
102
101
  '_config.yml file, like so:'
@@ -124,13 +123,12 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
124
123
  Jekyll.logger.warn ' https://www.algolia.com/explorer'
125
124
  exit 1
126
125
  end
127
- true
126
+ nil
128
127
  end
129
128
 
130
129
  # Get index settings
131
130
  def configure_index(index)
132
131
  settings = {
133
- typoTolerance: true,
134
132
  distinct: true,
135
133
  attributeForDistinct: 'title',
136
134
  attributesForFaceting: %w(tags type title),
@@ -149,18 +147,14 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
149
147
  css_selector
150
148
  css_selector_parent
151
149
  ),
152
- customRanking: ['desc(posted_at)', 'desc(title_weight)'],
150
+ customRanking: ['desc(posted_at)', 'desc(weight)'],
153
151
  highlightPreTag: '<span class="algolia__result-highlight">',
154
152
  highlightPostTag: '</span>'
155
153
  }
156
154
 
157
155
  # Merge default settings with user custom ones
158
- if @config['algolia'].key?('settings')
159
- custom_settings = {}
160
- @config['algolia']['settings'].each do |key, value|
161
- custom_settings[key.to_sym] = value
162
- end
163
- settings.merge!(custom_settings)
156
+ (@config['algolia']['settings'] || []).each do |key, value|
157
+ settings[key.to_sym] = value
164
158
  end
165
159
 
166
160
  index.set_settings(settings)
@@ -0,0 +1,184 @@
1
+ require 'algoliasearch'
2
+ require 'nokogiri'
3
+ require 'json'
4
+
5
+ # Given an HTML file as input, will return an array of records to index
6
+ class AlgoliaSearchRecordExtractor
7
+ def initialize(file)
8
+ @file = file
9
+ default_config = {
10
+ 'record_css_selector' => 'p'
11
+ }
12
+ @config = default_config.merge(file.site.config['algolia'])
13
+ end
14
+
15
+ # Hook to modify a record after extracting
16
+ def custom_hook_each(item, _node)
17
+ item
18
+ end
19
+
20
+ # Hook to modify all records after extracting
21
+ def custom_hook_all(items)
22
+ items
23
+ end
24
+
25
+ # Returns metadata from the current file
26
+ def metadata
27
+ return metadata_page if @file.is_a?(Jekyll::Page)
28
+ return metadata_post if @file.is_a?(Jekyll::Post)
29
+ {}
30
+ end
31
+
32
+ # Extract a list of tags
33
+ def tags
34
+ return nil unless @file.respond_to? :tags
35
+ # Some plugins will extend the tags from simple strings to full featured
36
+ # objects. We'll simply call .to_s to always have a string
37
+ @file.tags.map(&:to_s)
38
+ end
39
+
40
+ # Extract metadata from a post
41
+ def metadata_post
42
+ {
43
+ type: 'post',
44
+ url: @file.url,
45
+ title: @file.title,
46
+ slug: @file.slug,
47
+ posted_at: @file.date.to_time.to_i,
48
+ tags: tags
49
+ }
50
+ end
51
+
52
+ # Extract metadata from a page
53
+ def metadata_page
54
+ {
55
+ type: 'page',
56
+ url: @file.url,
57
+ title: @file['title'],
58
+ slug: @file.basename
59
+ }
60
+ end
61
+
62
+ # Get the list of all HTML nodes to index
63
+ def html_nodes
64
+ document = Nokogiri::HTML(@file.content)
65
+ document.css(@config['record_css_selector'])
66
+ end
67
+
68
+ # Get the closest heading parent
69
+ def node_heading_parent(node, level = 'h7')
70
+ headings = %w(h1 h2 h3 h4 h5 h6)
71
+
72
+ # If initially called on a heading, we must not accept it but only accept
73
+ # stronger (higher-level) headings
74
+ level = node.name if level == 'h7' && headings.include?(node.name)
75
+
76
+ previous = node.previous_element
77
+
78
+ # No previous element, we go up to the parent
79
+ unless previous
80
+ parent = node.parent
81
+ # No more parent, then no heading found
82
+ return nil if parent.name == 'body'
83
+ return node_heading_parent(parent, level)
84
+ end
85
+
86
+ # This is a heading, we return it
87
+ return previous if headings.include?(previous.name) && previous.name < level
88
+
89
+ node_heading_parent(previous, level)
90
+ end
91
+
92
+ # Get all the parent headings of the specified node
93
+ def node_hierarchy(node, memo = { level: 7 })
94
+ previous = node_heading_parent(node)
95
+
96
+ # No previous heading, we can stop the recursion
97
+ unless previous
98
+ memo.delete(:level)
99
+ return memo
100
+ end
101
+
102
+ tag_name = previous.name
103
+ level = tag_name.gsub('h', '').to_i
104
+ content = previous.content
105
+
106
+ # Skip if item already has a title of a higher level
107
+ return node_hierarchy(previous, memo) if level >= memo[:level]
108
+ memo[:level] = level
109
+
110
+ # Add to the memo and continue
111
+ memo[tag_name.to_sym] = content
112
+ node_hierarchy(previous, memo)
113
+ end
114
+
115
+ # Return the raw HTML of the element to index
116
+ def node_raw_html(node)
117
+ node.to_s
118
+ end
119
+
120
+ # Return the text of the element, sanitized to be displayed
121
+ def node_text(node)
122
+ node.content.gsub('<', '&lt;').gsub('>', '&gt;')
123
+ end
124
+
125
+ # Returns a unique string of hierarchy from title to h6, used for distinct
126
+ def unique_hierarchy(data)
127
+ headings = %w(title h1 h2 h3 h4 h5 h6)
128
+ headings.map { |heading| data[heading.to_sym] }.compact.join(' > ')
129
+ end
130
+
131
+ # Returns a CSS selector string for the given node (nil if the node is nil):
132
+ # the id selector when an id is set, otherwise the Nokogiri CSS path
133
+ def node_css_selector(node)
134
+ return nil if node.nil?
135
+
136
+ # Use the CSS id if one is set
137
+ return "##{node['id']}" if node['id']
138
+
139
+ # Default Nokogiri selector
140
+ node.css_path.gsub('html > body > ', '')
141
+ end
142
+
143
+ # Returns a custom numeric value representing how relevant to its hierarchy
144
+ # this record is. This value can be used in the custom ranking to display more
145
+ # relevant records first.
146
+ def weight(data)
147
+ # Get list of unique words in headings
148
+ title_words = %i(title h1 h2 h3 h4 h5 h6)
149
+ .select { |title| data.key?(title) }
150
+ .map { |title| data[title].split(/\W+/) }
151
+ .flatten
152
+ .compact
153
+ .map(&:downcase)
154
+ .uniq
155
+ # Intersect words in headings with words in text
156
+ text_words = data[:text].downcase.split(/\W+/)
157
+ (title_words & text_words).size
158
+ end
159
+
160
+ def extract
161
+ items = []
162
+ html_nodes.each_with_index do |node, index|
163
+ next unless node.text.size > 0
164
+
165
+ item = metadata.clone
166
+ item[:objectID] = "#{item[:slug]}_#{index}"
167
+ item.merge!(node_hierarchy(node))
168
+ item[:tag_name] = node.name
169
+ item[:raw_html] = node_raw_html(node)
170
+ item[:text] = node_text(node)
171
+ item[:unique_hierarchy] = unique_hierarchy(item)
172
+ item[:css_selector] = node_css_selector(node)
173
+ item[:css_selector_parent] = node_css_selector(node_heading_parent(node))
174
+ item[:weight] = weight(item)
175
+
176
+ # We pass item through the user defined custom hook
177
+ item = custom_hook_each(item, node)
178
+ next if item.nil?
179
+
180
+ items << item
181
+ end
182
+ custom_hook_all(items)
183
+ end
184
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: algoliasearch-jekyll
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tim Carry
@@ -117,6 +117,7 @@ extra_rdoc_files: []
117
117
  files:
118
118
  - lib/algoliasearch-jekyll.rb
119
119
  - lib/push.rb
120
+ - lib/record_extractor.rb
120
121
  homepage: https://github.com/algolia/algoliasearch-jekyll
121
122
  licenses:
122
123
  - MIT