algoliasearch-jekyll 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/push.rb +7 -13
- data/lib/record_extractor.rb +184 -0
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3ab982386891076f076e8a420e4f7bce3ae6c4c3
|
4
|
+
data.tar.gz: 3ee4daebb86e545421496ee3b7d1badd1e7484ed
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5a52c45e78b71da6d978fc8550c18350002a039b1433c5ece654992a3bcf508eadd7435dea2d977db359c2118f03fcaf2b02c2ef4da05db34cdb9c8c3c7af5e8
|
7
|
+
data.tar.gz: 127b1475369e4196e81eaac262da037e41cecfc9e2df38ea49dfa80aeebbe4300cd84865362891808152b8838b81b35cf7533c86809af02909a3af452a5a358a
|
data/lib/push.rb
CHANGED
@@ -41,9 +41,7 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
41
41
|
|
42
42
|
# Exclude files manually excluded from config
|
43
43
|
excluded_files = @config['algolia']['excluded_files']
|
44
|
-
|
45
|
-
return false if excluded_files.include?(file.name)
|
46
|
-
end
|
44
|
+
return false if excluded_files && excluded_files.include?(file.name)
|
47
45
|
|
48
46
|
true
|
49
47
|
end
|
@@ -60,6 +58,7 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
60
58
|
|
61
59
|
new_items = AlgoliaSearchRecordExtractor.new(file).extract
|
62
60
|
next if new_items.nil?
|
61
|
+
|
63
62
|
items += new_items
|
64
63
|
end
|
65
64
|
AlgoliaSearchJekyllPush.push(items)
|
@@ -96,7 +95,7 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
96
95
|
exit 1
|
97
96
|
end
|
98
97
|
|
99
|
-
unless @config['algolia']['application_id']
|
98
|
+
unless @config['algolia'] && @config['algolia']['application_id']
|
100
99
|
Jekyll.logger.error 'Algolia Error: No application ID defined'
|
101
100
|
Jekyll.logger.warn ' Please set your application id in the '\
|
102
101
|
'_config.yml file, like so:'
|
@@ -124,13 +123,12 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
124
123
|
Jekyll.logger.warn ' https://www.algolia.com/explorer'
|
125
124
|
exit 1
|
126
125
|
end
|
127
|
-
|
126
|
+
nil
|
128
127
|
end
|
129
128
|
|
130
129
|
# Get index settings
|
131
130
|
def configure_index(index)
|
132
131
|
settings = {
|
133
|
-
typoTolerance: true,
|
134
132
|
distinct: true,
|
135
133
|
attributeForDistinct: 'title',
|
136
134
|
attributesForFaceting: %w(tags type title),
|
@@ -149,18 +147,14 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
149
147
|
css_selector
|
150
148
|
css_selector_parent
|
151
149
|
),
|
152
|
-
customRanking: ['desc(posted_at)', 'desc(
|
150
|
+
customRanking: ['desc(posted_at)', 'desc(weight)'],
|
153
151
|
highlightPreTag: '<span class="algolia__result-highlight">',
|
154
152
|
highlightPostTag: '</span>'
|
155
153
|
}
|
156
154
|
|
157
155
|
# Merge default settings with user custom ones
|
158
|
-
|
159
|
-
|
160
|
-
@config['algolia']['settings'].each do |key, value|
|
161
|
-
custom_settings[key.to_sym] = value
|
162
|
-
end
|
163
|
-
settings.merge!(custom_settings)
|
156
|
+
(@config['algolia']['settings'] || []).each do |key, value|
|
157
|
+
settings[key.to_sym] = value
|
164
158
|
end
|
165
159
|
|
166
160
|
index.set_settings(settings)
|
@@ -0,0 +1,184 @@
|
|
1
|
+
require 'algoliasearch'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
# Given an HTML file as input, will return an array of records to index
|
6
|
+
class AlgoliaSearchRecordExtractor
|
7
|
+
def initialize(file)
|
8
|
+
@file = file
|
9
|
+
default_config = {
|
10
|
+
'record_css_selector' => 'p'
|
11
|
+
}
|
12
|
+
@config = default_config.merge(file.site.config['algolia'])
|
13
|
+
end
|
14
|
+
|
15
|
+
# Hook to modify a record after extracting
|
16
|
+
def custom_hook_each(item, _node)
|
17
|
+
item
|
18
|
+
end
|
19
|
+
|
20
|
+
# Hook to modify all records after extracting
|
21
|
+
def custom_hook_all(items)
|
22
|
+
items
|
23
|
+
end
|
24
|
+
|
25
|
+
# Returns metadata from the current file
|
26
|
+
def metadata
|
27
|
+
return metadata_page if @file.is_a?(Jekyll::Page)
|
28
|
+
return metadata_post if @file.is_a?(Jekyll::Post)
|
29
|
+
{}
|
30
|
+
end
|
31
|
+
|
32
|
+
# Extract a list of tags
|
33
|
+
def tags
|
34
|
+
return nil unless @file.respond_to? :tags
|
35
|
+
# Some plugins will extend the tags from simple strings to full featured
|
36
|
+
# objects. We'll simply call .to_s to always have a string
|
37
|
+
@file.tags.map(&:to_s)
|
38
|
+
end
|
39
|
+
|
40
|
+
# Extract metadata from a post
|
41
|
+
def metadata_post
|
42
|
+
{
|
43
|
+
type: 'post',
|
44
|
+
url: @file.url,
|
45
|
+
title: @file.title,
|
46
|
+
slug: @file.slug,
|
47
|
+
posted_at: @file.date.to_time.to_i,
|
48
|
+
tags: tags
|
49
|
+
}
|
50
|
+
end
|
51
|
+
|
52
|
+
# Extract metadata from a page
|
53
|
+
def metadata_page
|
54
|
+
{
|
55
|
+
type: 'page',
|
56
|
+
url: @file.url,
|
57
|
+
title: @file['title'],
|
58
|
+
slug: @file.basename
|
59
|
+
}
|
60
|
+
end
|
61
|
+
|
62
|
+
# Get the list of all HTML nodes to index
|
63
|
+
def html_nodes
|
64
|
+
document = Nokogiri::HTML(@file.content)
|
65
|
+
document.css(@config['record_css_selector'])
|
66
|
+
end
|
67
|
+
|
68
|
+
# Get the closest heading parent
|
69
|
+
def node_heading_parent(node, level = 'h7')
|
70
|
+
headings = %w(h1 h2 h3 h4 h5 h6)
|
71
|
+
|
72
|
+
# If initially called on a heading, we must not accept it but only accept
|
73
|
+
# strong headings
|
74
|
+
level = node.name if level == 'h7' && headings.include?(node.name)
|
75
|
+
|
76
|
+
previous = node.previous_element
|
77
|
+
|
78
|
+
# No previous element, we go up to the parent
|
79
|
+
unless previous
|
80
|
+
parent = node.parent
|
81
|
+
# No more parent, then no heading found
|
82
|
+
return nil if parent.name == 'body'
|
83
|
+
return node_heading_parent(parent, level)
|
84
|
+
end
|
85
|
+
|
86
|
+
# This is a heading, we return it
|
87
|
+
return previous if headings.include?(previous.name) && previous.name < level
|
88
|
+
|
89
|
+
node_heading_parent(previous, level)
|
90
|
+
end
|
91
|
+
|
92
|
+
# Get all the parent headings of the specified node
|
93
|
+
def node_hierarchy(node, memo = { level: 7 })
|
94
|
+
previous = node_heading_parent(node)
|
95
|
+
|
96
|
+
# No previous heading, we can stop the recursion
|
97
|
+
unless previous
|
98
|
+
memo.delete(:level)
|
99
|
+
return memo
|
100
|
+
end
|
101
|
+
|
102
|
+
tag_name = previous.name
|
103
|
+
level = tag_name.gsub('h', '').to_i
|
104
|
+
content = previous.content
|
105
|
+
|
106
|
+
# Skip if item already as title of a higher level
|
107
|
+
return node_hierarchy(previous, memo) if level >= memo[:level]
|
108
|
+
memo[:level] = level
|
109
|
+
|
110
|
+
# Add to the memo and continue
|
111
|
+
memo[tag_name.to_sym] = content
|
112
|
+
node_hierarchy(previous, memo)
|
113
|
+
end
|
114
|
+
|
115
|
+
# Return the raw HTML of the element to index
|
116
|
+
def node_raw_html(node)
|
117
|
+
node.to_s
|
118
|
+
end
|
119
|
+
|
120
|
+
# Return the text of the element, sanitized to be displayed
|
121
|
+
def node_text(node)
|
122
|
+
node.content.gsub('<', '&lt;').gsub('>', '&gt;')
|
123
|
+
end
|
124
|
+
|
125
|
+
# Returns a unique string of hierarchy from title to h6, used for distinct
|
126
|
+
def unique_hierarchy(data)
|
127
|
+
headings = %w(title h1 h2 h3 h4 h5 h6)
|
128
|
+
headings.map { |heading| data[heading.to_sym] }.compact.join(' > ')
|
129
|
+
end
|
130
|
+
|
131
|
+
# Returns a hash of two CSS selectors. One for the node itself, and one its
|
132
|
+
# closest heading parent
|
133
|
+
def node_css_selector(node)
|
134
|
+
return nil if node.nil?
|
135
|
+
|
136
|
+
# Use the CSS id if one is set
|
137
|
+
return "##{node['id']}" if node['id']
|
138
|
+
|
139
|
+
# Default Nokogiri selector
|
140
|
+
node.css_path.gsub('html > body > ', '')
|
141
|
+
end
|
142
|
+
|
143
|
+
# Returns a custom numeric value representing how relevant to its hierarchy
|
144
|
+
# this record is. This value can be used in the custom ranking to display more
|
145
|
+
# relevant records first.
|
146
|
+
def weight(data)
|
147
|
+
# Get list of unique words in headings
|
148
|
+
title_words = %i(title h1 h2 h3 h4 h5 h6)
|
149
|
+
.select { |title| data.key?(title) }
|
150
|
+
.map { |title| data[title].split(/\W+/) }
|
151
|
+
.flatten
|
152
|
+
.compact
|
153
|
+
.map(&:downcase)
|
154
|
+
.uniq
|
155
|
+
# Intersect words in headings with words in test
|
156
|
+
text_words = data[:text].downcase.split(/\W+/)
|
157
|
+
(title_words & text_words).size
|
158
|
+
end
|
159
|
+
|
160
|
+
def extract
|
161
|
+
items = []
|
162
|
+
html_nodes.each_with_index do |node, index|
|
163
|
+
next unless node.text.size > 0
|
164
|
+
|
165
|
+
item = metadata.clone
|
166
|
+
item[:objectID] = "#{item[:slug]}_#{index}"
|
167
|
+
item.merge!(node_hierarchy(node))
|
168
|
+
item[:tag_name] = node.name
|
169
|
+
item[:raw_html] = node_raw_html(node)
|
170
|
+
item[:text] = node_text(node)
|
171
|
+
item[:unique_hierarchy] = unique_hierarchy(item)
|
172
|
+
item[:css_selector] = node_css_selector(node)
|
173
|
+
item[:css_selector_parent] = node_css_selector(node_heading_parent(node))
|
174
|
+
item[:weight] = weight(item)
|
175
|
+
|
176
|
+
# We pass item through the user defined custom hook
|
177
|
+
item = custom_hook_each(item, node)
|
178
|
+
next if item.nil?
|
179
|
+
|
180
|
+
items << item
|
181
|
+
end
|
182
|
+
custom_hook_all(items)
|
183
|
+
end
|
184
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: algoliasearch-jekyll
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Carry
|
@@ -117,6 +117,7 @@ extra_rdoc_files: []
|
|
117
117
|
files:
|
118
118
|
- lib/algoliasearch-jekyll.rb
|
119
119
|
- lib/push.rb
|
120
|
+
- lib/record_extractor.rb
|
120
121
|
homepage: https://github.com/algolia/algoliasearch-jekyll
|
121
122
|
licenses:
|
122
123
|
- MIT
|