algoliasearch-jekyll 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/push.rb +7 -13
- data/lib/record_extractor.rb +184 -0
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3ab982386891076f076e8a420e4f7bce3ae6c4c3
|
4
|
+
data.tar.gz: 3ee4daebb86e545421496ee3b7d1badd1e7484ed
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5a52c45e78b71da6d978fc8550c18350002a039b1433c5ece654992a3bcf508eadd7435dea2d977db359c2118f03fcaf2b02c2ef4da05db34cdb9c8c3c7af5e8
|
7
|
+
data.tar.gz: 127b1475369e4196e81eaac262da037e41cecfc9e2df38ea49dfa80aeebbe4300cd84865362891808152b8838b81b35cf7533c86809af02909a3af452a5a358a
|
data/lib/push.rb
CHANGED
@@ -41,9 +41,7 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
41
41
|
|
42
42
|
# Exclude files manually excluded from config
|
43
43
|
excluded_files = @config['algolia']['excluded_files']
|
44
|
-
|
45
|
-
return false if excluded_files.include?(file.name)
|
46
|
-
end
|
44
|
+
return false if excluded_files && excluded_files.include?(file.name)
|
47
45
|
|
48
46
|
true
|
49
47
|
end
|
@@ -60,6 +58,7 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
60
58
|
|
61
59
|
new_items = AlgoliaSearchRecordExtractor.new(file).extract
|
62
60
|
next if new_items.nil?
|
61
|
+
|
63
62
|
items += new_items
|
64
63
|
end
|
65
64
|
AlgoliaSearchJekyllPush.push(items)
|
@@ -96,7 +95,7 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
96
95
|
exit 1
|
97
96
|
end
|
98
97
|
|
99
|
-
unless @config['algolia']['application_id']
|
98
|
+
unless @config['algolia'] && @config['algolia']['application_id']
|
100
99
|
Jekyll.logger.error 'Algolia Error: No application ID defined'
|
101
100
|
Jekyll.logger.warn ' Please set your application id in the '\
|
102
101
|
'_config.yml file, like so:'
|
@@ -124,13 +123,12 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
124
123
|
Jekyll.logger.warn ' https://www.algolia.com/explorer'
|
125
124
|
exit 1
|
126
125
|
end
|
127
|
-
|
126
|
+
nil
|
128
127
|
end
|
129
128
|
|
130
129
|
# Get index settings
|
131
130
|
def configure_index(index)
|
132
131
|
settings = {
|
133
|
-
typoTolerance: true,
|
134
132
|
distinct: true,
|
135
133
|
attributeForDistinct: 'title',
|
136
134
|
attributesForFaceting: %w(tags type title),
|
@@ -149,18 +147,14 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
149
147
|
css_selector
|
150
148
|
css_selector_parent
|
151
149
|
),
|
152
|
-
customRanking: ['desc(posted_at)', 'desc(
|
150
|
+
customRanking: ['desc(posted_at)', 'desc(weight)'],
|
153
151
|
highlightPreTag: '<span class="algolia__result-highlight">',
|
154
152
|
highlightPostTag: '</span>'
|
155
153
|
}
|
156
154
|
|
157
155
|
# Merge default settings with user custom ones
|
158
|
-
|
159
|
-
|
160
|
-
@config['algolia']['settings'].each do |key, value|
|
161
|
-
custom_settings[key.to_sym] = value
|
162
|
-
end
|
163
|
-
settings.merge!(custom_settings)
|
156
|
+
(@config['algolia']['settings'] || []).each do |key, value|
|
157
|
+
settings[key.to_sym] = value
|
164
158
|
end
|
165
159
|
|
166
160
|
index.set_settings(settings)
|
@@ -0,0 +1,184 @@
|
|
1
|
+
require 'algoliasearch'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
# Given an HTML file as input, will return an array of records to index
|
6
|
+
class AlgoliaSearchRecordExtractor
|
7
|
+
def initialize(file)
|
8
|
+
@file = file
|
9
|
+
default_config = {
|
10
|
+
'record_css_selector' => 'p'
|
11
|
+
}
|
12
|
+
@config = default_config.merge(file.site.config['algolia'])
|
13
|
+
end
|
14
|
+
|
15
|
+
# Hook to modify a record after extracting
|
16
|
+
def custom_hook_each(item, _node)
|
17
|
+
item
|
18
|
+
end
|
19
|
+
|
20
|
+
# Hook to modify all records after extracting
|
21
|
+
def custom_hook_all(items)
|
22
|
+
items
|
23
|
+
end
|
24
|
+
|
25
|
+
# Returns metadata from the current file
|
26
|
+
def metadata
|
27
|
+
return metadata_page if @file.is_a?(Jekyll::Page)
|
28
|
+
return metadata_post if @file.is_a?(Jekyll::Post)
|
29
|
+
{}
|
30
|
+
end
|
31
|
+
|
32
|
+
# Extract a list of tags
|
33
|
+
def tags
|
34
|
+
return nil unless @file.respond_to? :tags
|
35
|
+
# Some plugins will extend the tags from simple strings to full featured
|
36
|
+
# objects. We'll simply call .to_s to always have a string
|
37
|
+
@file.tags.map(&:to_s)
|
38
|
+
end
|
39
|
+
|
40
|
+
# Extract metadata from a post
|
41
|
+
def metadata_post
|
42
|
+
{
|
43
|
+
type: 'post',
|
44
|
+
url: @file.url,
|
45
|
+
title: @file.title,
|
46
|
+
slug: @file.slug,
|
47
|
+
posted_at: @file.date.to_time.to_i,
|
48
|
+
tags: tags
|
49
|
+
}
|
50
|
+
end
|
51
|
+
|
52
|
+
# Extract metadata from a page
|
53
|
+
def metadata_page
|
54
|
+
{
|
55
|
+
type: 'page',
|
56
|
+
url: @file.url,
|
57
|
+
title: @file['title'],
|
58
|
+
slug: @file.basename
|
59
|
+
}
|
60
|
+
end
|
61
|
+
|
62
|
+
# Get the list of all HTML nodes to index
|
63
|
+
def html_nodes
|
64
|
+
document = Nokogiri::HTML(@file.content)
|
65
|
+
document.css(@config['record_css_selector'])
|
66
|
+
end
|
67
|
+
|
68
|
+
# Get the closest heading parent
|
69
|
+
def node_heading_parent(node, level = 'h7')
|
70
|
+
headings = %w(h1 h2 h3 h4 h5 h6)
|
71
|
+
|
72
|
+
# If initially called on a heading, we must not accept headings of the same
|
73
|
+
# or lower rank: only a strictly higher-ranked heading (e.g. h2 for an h3)
|
74
|
+
level = node.name if level == 'h7' && headings.include?(node.name)
|
75
|
+
|
76
|
+
previous = node.previous_element
|
77
|
+
|
78
|
+
# No previous element, we go up to the parent
|
79
|
+
unless previous
|
80
|
+
parent = node.parent
|
81
|
+
# No more parent, then no heading found
|
82
|
+
return nil if parent.name == 'body'
|
83
|
+
return node_heading_parent(parent, level)
|
84
|
+
end
|
85
|
+
|
86
|
+
# This is a heading, we return it
|
87
|
+
return previous if headings.include?(previous.name) && previous.name < level
|
88
|
+
|
89
|
+
node_heading_parent(previous, level)
|
90
|
+
end
|
91
|
+
|
92
|
+
# Get all the parent headings of the specified node
|
93
|
+
def node_hierarchy(node, memo = { level: 7 })
|
94
|
+
previous = node_heading_parent(node)
|
95
|
+
|
96
|
+
# No previous heading, we can stop the recursion
|
97
|
+
unless previous
|
98
|
+
memo.delete(:level)
|
99
|
+
return memo
|
100
|
+
end
|
101
|
+
|
102
|
+
tag_name = previous.name
|
103
|
+
level = tag_name.gsub('h', '').to_i
|
104
|
+
content = previous.content
|
105
|
+
|
106
|
+
# Skip this heading if the item already has a title of the same or a higher level
|
107
|
+
return node_hierarchy(previous, memo) if level >= memo[:level]
|
108
|
+
memo[:level] = level
|
109
|
+
|
110
|
+
# Add to the memo and continue
|
111
|
+
memo[tag_name.to_sym] = content
|
112
|
+
node_hierarchy(previous, memo)
|
113
|
+
end
|
114
|
+
|
115
|
+
# Return the raw HTML of the element to index
|
116
|
+
def node_raw_html(node)
|
117
|
+
node.to_s
|
118
|
+
end
|
119
|
+
|
120
|
+
# Return the text of the element, sanitized to be displayed
|
121
|
+
def node_text(node)
|
122
|
+
node.content.gsub('<', '&lt;').gsub('>', '&gt;')
|
123
|
+
end
|
124
|
+
|
125
|
+
# Returns a unique string of hierarchy from title to h6, used for distinct
|
126
|
+
def unique_hierarchy(data)
|
127
|
+
headings = %w(title h1 h2 h3 h4 h5 h6)
|
128
|
+
headings.map { |heading| data[heading.to_sym] }.compact.join(' > ')
|
129
|
+
end
|
130
|
+
|
131
|
+
# Returns a CSS selector for the given node: its id selector when an id is
|
132
|
+
# set, otherwise its Nokogiri CSS path without the leading 'html > body > '
|
133
|
+
def node_css_selector(node)
|
134
|
+
return nil if node.nil?
|
135
|
+
|
136
|
+
# Use the CSS id if one is set
|
137
|
+
return "##{node['id']}" if node['id']
|
138
|
+
|
139
|
+
# Default Nokogiri selector
|
140
|
+
node.css_path.gsub('html > body > ', '')
|
141
|
+
end
|
142
|
+
|
143
|
+
# Returns a custom numeric value representing how relevant to its hierarchy
|
144
|
+
# this record is. This value can be used in the custom ranking to display more
|
145
|
+
# relevant records first.
|
146
|
+
def weight(data)
|
147
|
+
# Get list of unique words in headings
|
148
|
+
title_words = %i(title h1 h2 h3 h4 h5 h6)
|
149
|
+
.select { |title| data.key?(title) }
|
150
|
+
.map { |title| data[title].split(/\W+/) }
|
151
|
+
.flatten
|
152
|
+
.compact
|
153
|
+
.map(&:downcase)
|
154
|
+
.uniq
|
155
|
+
# Intersect words in headings with words in text
|
156
|
+
text_words = data[:text].downcase.split(/\W+/)
|
157
|
+
(title_words & text_words).size
|
158
|
+
end
|
159
|
+
|
160
|
+
def extract
|
161
|
+
items = []
|
162
|
+
html_nodes.each_with_index do |node, index|
|
163
|
+
next unless node.text.size > 0
|
164
|
+
|
165
|
+
item = metadata.clone
|
166
|
+
item[:objectID] = "#{item[:slug]}_#{index}"
|
167
|
+
item.merge!(node_hierarchy(node))
|
168
|
+
item[:tag_name] = node.name
|
169
|
+
item[:raw_html] = node_raw_html(node)
|
170
|
+
item[:text] = node_text(node)
|
171
|
+
item[:unique_hierarchy] = unique_hierarchy(item)
|
172
|
+
item[:css_selector] = node_css_selector(node)
|
173
|
+
item[:css_selector_parent] = node_css_selector(node_heading_parent(node))
|
174
|
+
item[:weight] = weight(item)
|
175
|
+
|
176
|
+
# We pass item through the user defined custom hook
|
177
|
+
item = custom_hook_each(item, node)
|
178
|
+
next if item.nil?
|
179
|
+
|
180
|
+
items << item
|
181
|
+
end
|
182
|
+
custom_hook_all(items)
|
183
|
+
end
|
184
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: algoliasearch-jekyll
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Carry
|
@@ -117,6 +117,7 @@ extra_rdoc_files: []
|
|
117
117
|
files:
|
118
118
|
- lib/algoliasearch-jekyll.rb
|
119
119
|
- lib/push.rb
|
120
|
+
- lib/record_extractor.rb
|
120
121
|
homepage: https://github.com/algolia/algoliasearch-jekyll
|
121
122
|
licenses:
|
122
123
|
- MIT
|