algoliasearch-jekyll 0.9.1 → 1.0.0.beta.pre.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +3 -4
- data/CONTRIBUTING.md +8 -1
- data/Gemfile +4 -5
- data/README.md +318 -11
- data/Rakefile +7 -12
- data/algoliasearch-jekyll.gemspec +66 -62
- data/gemfiles/jekyll_v2.gemfile +3 -3
- data/gemfiles/jekyll_v3.gemfile +4 -4
- data/gemfiles/jekyll_v3_1_3.gemfile +24 -0
- data/gemfiles/jekyll_v3_1_6.gemfile +24 -0
- data/lib/algoliasearch-jekyll.rb +1 -3
- data/lib/credential_checker.rb +2 -1
- data/lib/error_handler.rb +6 -0
- data/lib/push.rb +81 -19
- data/lib/record_extractor.rb +120 -140
- data/lib/utils.rb +13 -0
- data/lib/version.rb +1 -1
- data/scripts/release +13 -12
- data/scripts/test_v3 +1 -1
- data/scripts/watch +4 -0
- data/spec/error_handler_spec.rb +17 -0
- data/spec/fixtures/jekyll_version_2/404.html +8 -0
- data/spec/fixtures/jekyll_version_2/404.md +9 -0
- data/spec/fixtures/jekyll_version_2/_my-collection/collection-item.md +3 -0
- data/spec/fixtures/jekyll_version_2/_posts/2015-07-02-test-post.md +1 -1
- data/spec/fixtures/jekyll_version_2/about.md +3 -0
- data/spec/fixtures/jekyll_version_2/front_matter.md +15 -0
- data/spec/fixtures/jekyll_version_2/index.html +3 -1
- data/spec/fixtures/jekyll_version_2/only-divs.md +15 -0
- data/spec/fixtures/jekyll_version_2/only-paragraphs.md +15 -0
- data/spec/fixtures/jekyll_version_3/404.html +8 -0
- data/spec/fixtures/jekyll_version_3/404.md +9 -0
- data/spec/fixtures/jekyll_version_3/_config.yml +1 -1
- data/spec/fixtures/jekyll_version_3/_my-collection/collection-item.md +3 -0
- data/spec/fixtures/jekyll_version_3/_posts/2015-07-02-test-post.md +1 -1
- data/spec/fixtures/jekyll_version_3/about.md +3 -0
- data/spec/fixtures/jekyll_version_3/front_matter.md +15 -0
- data/spec/fixtures/jekyll_version_3/index.html +4 -1
- data/spec/fixtures/jekyll_version_3/only-divs.md +15 -0
- data/spec/fixtures/jekyll_version_3/only-paragraphs.md +15 -0
- data/spec/push_spec.rb +211 -8
- data/spec/record_extractor_spec.rb +296 -358
- data/spec/spec_helper.rb +32 -11
- data/txt/record_too_big +19 -0
- metadata +40 -51
- data/scripts/watch +0 -1
data/gemfiles/jekyll_v2.gemfile
CHANGED
@@ -5,8 +5,9 @@ source "http://rubygems.org"
|
|
5
5
|
gem "algoliasearch", "~> 1.4"
|
6
6
|
gem "appraisal", "~> 2.1.0"
|
7
7
|
gem "awesome_print", "~> 1.6"
|
8
|
-
gem "json", "
|
9
|
-
gem "nokogiri",
|
8
|
+
gem "json", "~> 1.8"
|
9
|
+
gem "nokogiri", "~> 1.6"
|
10
|
+
gem "html-hierarchy-extractor", :path => "../../html-hierarchy-extractor/"
|
10
11
|
gem "verbal_expressions", "~> 0.1.5"
|
11
12
|
gem "jekyll", "~> 2.5"
|
12
13
|
|
@@ -19,5 +20,4 @@ group :development do
|
|
19
20
|
gem "rspec", "~> 3.0"
|
20
21
|
gem "rubocop", "~> 0.31"
|
21
22
|
gem "simplecov", "~> 0.10"
|
22
|
-
gem "rack", "< 2"
|
23
23
|
end
|
data/gemfiles/jekyll_v3.gemfile
CHANGED
@@ -5,10 +5,11 @@ source "http://rubygems.org"
|
|
5
5
|
gem "algoliasearch", "~> 1.4"
|
6
6
|
gem "appraisal", "~> 2.1.0"
|
7
7
|
gem "awesome_print", "~> 1.6"
|
8
|
-
gem "json", "
|
9
|
-
gem "nokogiri",
|
8
|
+
gem "json", "~> 1.8"
|
9
|
+
gem "nokogiri", "~> 1.6"
|
10
|
+
gem "html-hierarchy-extractor", :path => "../../html-hierarchy-extractor/"
|
10
11
|
gem "verbal_expressions", "~> 0.1.5"
|
11
|
-
gem "jekyll", "
|
12
|
+
gem "jekyll", "3.1.6"
|
12
13
|
gem "jekyll-paginate", "~> 1.1.0"
|
13
14
|
|
14
15
|
group :development do
|
@@ -20,5 +21,4 @@ group :development do
|
|
20
21
|
gem "rspec", "~> 3.0"
|
21
22
|
gem "rubocop", "~> 0.31"
|
22
23
|
gem "simplecov", "~> 0.10"
|
23
|
-
gem "rack", "< 2"
|
24
24
|
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# This file was generated by Appraisal
|
2
|
+
|
3
|
+
source "http://rubygems.org"
|
4
|
+
|
5
|
+
gem "algoliasearch", "~> 1.4"
|
6
|
+
gem "appraisal", "~> 2.1.0"
|
7
|
+
gem "awesome_print", "~> 1.6"
|
8
|
+
gem "json", "~> 1.8"
|
9
|
+
gem "nokogiri", "~> 1.6"
|
10
|
+
gem "html-hierarchy-extractor", :path => "../../html-hierarchy-extractor/"
|
11
|
+
gem "verbal_expressions", "~> 0.1.5"
|
12
|
+
gem "jekyll", "3.1.3"
|
13
|
+
gem "jekyll-paginate", "~> 1.1.0"
|
14
|
+
|
15
|
+
group :development do
|
16
|
+
gem "coveralls", "~> 0.8"
|
17
|
+
gem "flay", "~> 2.6"
|
18
|
+
gem "flog", "~> 4.3"
|
19
|
+
gem "guard-rspec", "~> 4.6"
|
20
|
+
gem "jeweler", "~> 2.0"
|
21
|
+
gem "rspec", "~> 3.0"
|
22
|
+
gem "rubocop", "~> 0.31"
|
23
|
+
gem "simplecov", "~> 0.10"
|
24
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# This file was generated by Appraisal
|
2
|
+
|
3
|
+
source "http://rubygems.org"
|
4
|
+
|
5
|
+
gem "algoliasearch", "~> 1.4"
|
6
|
+
gem "appraisal", "~> 2.1.0"
|
7
|
+
gem "awesome_print", "~> 1.6"
|
8
|
+
gem "json", "~> 1.8"
|
9
|
+
gem "nokogiri", "~> 1.6"
|
10
|
+
gem "html-hierarchy-extractor", :path => "../../html-hierarchy-extractor/"
|
11
|
+
gem "verbal_expressions", "~> 0.1.5"
|
12
|
+
gem "jekyll", "3.1.6"
|
13
|
+
gem "jekyll-paginate", "~> 1.1.0"
|
14
|
+
|
15
|
+
group :development do
|
16
|
+
gem "coveralls", "~> 0.8"
|
17
|
+
gem "flay", "~> 2.6"
|
18
|
+
gem "flog", "~> 4.3"
|
19
|
+
gem "guard-rspec", "~> 4.6"
|
20
|
+
gem "jeweler", "~> 2.0"
|
21
|
+
gem "rspec", "~> 3.0"
|
22
|
+
gem "rubocop", "~> 0.31"
|
23
|
+
gem "simplecov", "~> 0.10"
|
24
|
+
end
|
data/lib/algoliasearch-jekyll.rb
CHANGED
@@ -1,12 +1,10 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'bundler/setup'
|
3
|
-
|
4
3
|
require 'awesome_print'
|
5
|
-
|
6
4
|
require_relative './version'
|
7
5
|
require_relative './push'
|
8
6
|
|
9
|
-
# `jekyll algolia`
|
7
|
+
# Registering the `jekyll algolia push` command
|
10
8
|
class AlgoliaSearchJekyll < Jekyll::Command
|
11
9
|
class << self
|
12
10
|
def init_with_program(prog)
|
data/lib/credential_checker.rb
CHANGED
@@ -3,7 +3,8 @@ require 'nokogiri'
|
|
3
3
|
require 'json'
|
4
4
|
require_relative './error_handler.rb'
|
5
5
|
|
6
|
-
#
|
6
|
+
# Will check that all the needed credentials are correctly given by the user
|
7
|
+
# before starting any push process
|
7
8
|
class AlgoliaSearchCredentialChecker
|
8
9
|
attr_accessor :config, :logger
|
9
10
|
|
data/lib/error_handler.rb
CHANGED
@@ -82,6 +82,12 @@ class AlgoliaSearchErrorHandler
|
|
82
82
|
return 'check_key_acl_to_tmp_index'
|
83
83
|
end
|
84
84
|
|
85
|
+
# Pushed record is above the 10KB limit
|
86
|
+
if error['http_error'] == 400 &&
|
87
|
+
error['json']['message'] =~ /^Record is too big/
|
88
|
+
return 'record_too_big'
|
89
|
+
end
|
90
|
+
|
85
91
|
false
|
86
92
|
end
|
87
93
|
end
|
data/lib/push.rb
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
require 'algoliasearch'
|
2
|
-
require 'nokogiri'
|
3
2
|
require 'json'
|
3
|
+
require 'nokogiri'
|
4
4
|
require_relative './version'
|
5
5
|
require_relative './record_extractor'
|
6
6
|
require_relative './credential_checker'
|
7
7
|
require_relative './error_handler'
|
8
8
|
|
9
|
-
# `jekyll algolia push` command
|
9
|
+
# `jekyll algolia push` main command
|
10
10
|
class AlgoliaSearchJekyllPush < Jekyll::Command
|
11
11
|
class << self
|
12
12
|
attr_accessor :options, :config
|
@@ -22,30 +22,42 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
22
22
|
@args = args
|
23
23
|
@options = options
|
24
24
|
@config = config
|
25
|
+
@checker = AlgoliaSearchCredentialChecker.new(@config)
|
25
26
|
@is_verbose = @config['verbose']
|
26
27
|
@is_dry_run = @config['dry_run']
|
28
|
+
@is_lazy_update = lazy_update?
|
27
29
|
|
28
30
|
self
|
29
31
|
end
|
30
32
|
|
33
|
+
# Check if the lazy update feature is enabled or not (default to false)
|
34
|
+
def lazy_update?
|
35
|
+
return false unless @config['algolia']
|
36
|
+
return true if @config['algolia']['lazy_update']
|
37
|
+
false
|
38
|
+
end
|
39
|
+
|
31
40
|
# Check if the specified file should be indexed (we exclude static files,
|
32
41
|
# robots.txt and custom defined exclusions).
|
33
42
|
def indexable?(file)
|
43
|
+
# Excluding all static assets (images, fonts, etc)
|
34
44
|
return false if file.is_a?(Jekyll::StaticFile)
|
35
45
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
# Keep only markdown and html files
|
46
|
+
# Jekyll auto-converts markdown to HTML, so if the file is neither
|
47
|
+
# markdown nor HTML, we should probably not index it
|
40
48
|
allowed_extensions = %w(html)
|
41
49
|
if @config['markdown_ext']
|
42
50
|
allowed_extensions += @config['markdown_ext'].split(',')
|
43
51
|
end
|
44
|
-
|
45
|
-
|
46
|
-
end
|
47
|
-
return false unless allowed_extensions.include?(extname)
|
52
|
+
extname = File.extname(File.basename(file.path))
|
53
|
+
return false unless allowed_extensions.include?(extname[1..-1])
|
48
54
|
|
55
|
+
# We should not index GitHub pages 404 pages
|
56
|
+
# https://help.github.com/articles/creating-a-custom-404-page-for-your-github-pages-site/
|
57
|
+
basename_no_ext = File.basename(file.path, extname)
|
58
|
+
return false if basename_no_ext == '404'
|
59
|
+
|
60
|
+
# Users can also define their own blacklist and hooks to exclude files
|
49
61
|
return false if excluded_file?(file)
|
50
62
|
|
51
63
|
true
|
@@ -53,14 +65,17 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
53
65
|
|
54
66
|
# Check if the file is in the list of excluded files
|
55
67
|
def excluded_file?(file)
|
68
|
+
# Blacklist of pages generated by Jekyll that we know should not be
|
69
|
+
# indexed
|
56
70
|
excluded = [
|
57
|
-
|
71
|
+
/^index\.html$/, # Index page
|
72
|
+
%r{^page([0-9]*)/index\.html} # Pagination pages
|
58
73
|
]
|
74
|
+
# User-provided blacklist
|
59
75
|
if @config['algolia']
|
60
76
|
excluded += (@config['algolia']['excluded_files'] || [])
|
61
77
|
end
|
62
78
|
|
63
|
-
# Exclude files explicitly excluded in _config
|
64
79
|
excluded.each do |pattern|
|
65
80
|
pattern = /#{Regexp.quote(pattern)}/ if pattern.is_a? String
|
66
81
|
return true if file.path =~ pattern
|
@@ -88,14 +103,17 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
88
103
|
items = []
|
89
104
|
is_verbose = config['verbose']
|
90
105
|
each_site_file do |file|
|
106
|
+
# Skip files that should not be indexed
|
91
107
|
next unless AlgoliaSearchJekyllPush.indexable?(file)
|
92
108
|
Jekyll.logger.info "Extracting data from #{file.path}" if is_verbose
|
109
|
+
|
93
110
|
new_items = AlgoliaSearchRecordExtractor.new(file).extract
|
94
111
|
next if new_items.nil?
|
95
112
|
ap new_items if is_verbose
|
96
113
|
|
97
114
|
items += new_items
|
98
115
|
end
|
116
|
+
|
99
117
|
AlgoliaSearchJekyllPush.push(items)
|
100
118
|
end
|
101
119
|
|
@@ -178,14 +196,11 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
178
196
|
end
|
179
197
|
end
|
180
198
|
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
Jekyll.logger.info '=== DRY RUN ===' if @is_dry_run
|
186
|
-
|
199
|
+
# Greedy update will push all the records to a temporary index, then
|
200
|
+
# override the existing index with this temp one
|
201
|
+
def greedy_update(items)
|
187
202
|
# Add items to a temp index, then rename it
|
188
|
-
index_name = checker.index_name
|
203
|
+
index_name = @checker.index_name
|
189
204
|
index_name_tmp = "#{index_name}_tmp"
|
190
205
|
batch_add_items(items, create_index(index_name_tmp))
|
191
206
|
Algolia.move_index(index_name_tmp, index_name) unless @is_dry_run
|
@@ -193,5 +208,52 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
193
208
|
Jekyll.logger.info "Indexing of #{items.size} items " \
|
194
209
|
"in #{index_name} done."
|
195
210
|
end
|
211
|
+
|
212
|
+
# Lazy update will minimize the number of operations by only pushing new
|
213
|
+
# data and deleting old data
|
214
|
+
def lazy_update(items)
|
215
|
+
index = create_index(@checker.index_name)
|
216
|
+
remote = remote_ids(index)
|
217
|
+
local = items.map { |item| item[:objectID] }
|
218
|
+
|
219
|
+
delete_remote_not_in_local(index, local, remote)
|
220
|
+
|
221
|
+
add_local_not_in_remote(index, items, local, remote)
|
222
|
+
end
|
223
|
+
|
224
|
+
# Array of all objectID in the remote index
|
225
|
+
def remote_ids(index)
|
226
|
+
list = []
|
227
|
+
index.browse(attributesToRetrieve: 'objectID') do |hit|
|
228
|
+
list << hit['objectID']
|
229
|
+
end
|
230
|
+
list
|
231
|
+
end
|
232
|
+
|
233
|
+
# Delete all remote items that are no longer in the local items
|
234
|
+
def delete_remote_not_in_local(index, local, remote)
|
235
|
+
list = remote - local
|
236
|
+
Jekyll.logger.info "Deleting #{list.size} items"
|
237
|
+
index.delete_objects!(list) unless list.empty?
|
238
|
+
end
|
239
|
+
|
240
|
+
# Push all local items that are not yet in the index
|
241
|
+
def add_local_not_in_remote(index, items, local, remote)
|
242
|
+
list = local - remote
|
243
|
+
return Jekyll.logger.info "Adding #{list.size} items" if list.empty?
|
244
|
+
items_to_push = items.select do |item|
|
245
|
+
list.include?(item[:objectID])
|
246
|
+
end
|
247
|
+
batch_add_items(items_to_push, index)
|
248
|
+
end
|
249
|
+
|
250
|
+
def push(items)
|
251
|
+
checker = AlgoliaSearchCredentialChecker.new(@config)
|
252
|
+
checker.assert_valid
|
253
|
+
|
254
|
+
Jekyll.logger.info '=== DRY RUN ===' if @is_dry_run
|
255
|
+
|
256
|
+
@is_lazy_update ? lazy_update(items) : greedy_update(items)
|
257
|
+
end
|
196
258
|
end
|
197
259
|
end
|
data/lib/record_extractor.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'algoliasearch'
|
2
2
|
require 'nokogiri'
|
3
3
|
require 'json'
|
4
|
+
require 'html-hierarchy-extractor'
|
5
|
+
require_relative './utils'
|
4
6
|
|
5
7
|
# Given an HTML file as input, will return an array of records to index
|
6
8
|
class AlgoliaSearchRecordExtractor
|
@@ -25,189 +27,167 @@ class AlgoliaSearchRecordExtractor
|
|
25
27
|
items
|
26
28
|
end
|
27
29
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
30
|
+
##
|
31
|
+
# Return the type of the Jekyll element
|
32
|
+
# It can be either page, post or document
|
33
|
+
def type
|
34
|
+
classname = @file.class.name
|
35
|
+
subclass = classname.split('::')[1]
|
36
|
+
type = subclass.downcase
|
32
37
|
|
33
|
-
|
34
|
-
|
38
|
+
# In Jekyll v2, Page, Post and Document have their own class
|
39
|
+
return type if AlgoliaSearchUtils.restrict_jekyll_version(less_than: '3.0')
|
35
40
|
|
36
|
-
|
41
|
+
# In Jekyll v3, Posts are actually a specific type of Document
|
42
|
+
if type == 'document'
|
43
|
+
collection_name = @file.collection.label
|
44
|
+
return 'post' if collection_name == 'posts'
|
45
|
+
end
|
37
46
|
|
38
|
-
|
39
|
-
|
47
|
+
type
|
48
|
+
end
|
40
49
|
|
41
|
-
|
50
|
+
##
|
51
|
+
# Return the url of the page
|
52
|
+
def url
|
53
|
+
@file.url
|
42
54
|
end
|
43
55
|
|
56
|
+
##
|
57
|
+
# Return the title of the page
|
58
|
+
def title
|
59
|
+
@file.data['title']
|
60
|
+
end
|
61
|
+
|
62
|
+
##
|
44
63
|
# Returns the slug of the document
|
45
64
|
def slug
|
46
|
-
#
|
47
|
-
return @file.data['slug'] if @file.data.key?('slug')
|
48
|
-
# Old Jekyll v2 has it at the root
|
49
|
-
return @file.slug if @file.respond_to? :slug
|
50
|
-
# Otherwise, we guess it from the filename
|
65
|
+
# We can guess the slug from the filename for all documents
|
51
66
|
basename = File.basename(@file.path)
|
52
67
|
extname = File.extname(basename)
|
53
|
-
File.basename(basename, extname)
|
54
|
-
end
|
55
|
-
|
56
|
-
# Extract a list of tags
|
57
|
-
def tags
|
58
|
-
tags = nil
|
68
|
+
slug = File.basename(basename, extname)
|
59
69
|
|
60
|
-
# Jekyll v3
|
61
|
-
if @file.data.key?('
|
62
|
-
tags = @file.data['tags']
|
63
|
-
elsif @file.respond_to? :tags
|
64
|
-
tags = @file.tags
|
65
|
-
end
|
66
|
-
|
67
|
-
return tags if tags.nil?
|
70
|
+
# Jekyll v3 posts have it in data
|
71
|
+
return @file.data['slug'] if @file.data.key?('slug')
|
68
72
|
|
69
|
-
#
|
70
|
-
|
71
|
-
tags.map(&:to_s)
|
72
|
-
end
|
73
|
+
# Jekyll v2 posts have a specific slug method
|
74
|
+
return @file.slug if @file.respond_to?(:slug)
|
73
75
|
|
74
|
-
|
75
|
-
def html_nodes
|
76
|
-
document = Nokogiri::HTML(@file.content)
|
77
|
-
document.css(@config['record_css_selector'])
|
76
|
+
slug
|
78
77
|
end
|
79
78
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
79
|
+
##
|
80
|
+
# Get an array of tags of the document
|
81
|
+
def tags
|
82
|
+
tags = []
|
84
83
|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
84
|
+
is_v2 = AlgoliaSearchUtils.restrict_jekyll_version(less_than: '3.0')
|
85
|
+
is_v3 = AlgoliaSearchUtils.restrict_jekyll_version(more_than: '3.0')
|
86
|
+
has_tags_method = @file.respond_to?(:tags)
|
87
|
+
has_tags_data = @file.data.key?('tags')
|
89
88
|
|
90
|
-
|
89
|
+
# Starting from Jekyll v3, all tags are in data['tags']
|
90
|
+
tags = @file.data['tags'] if is_v3 && has_tags_data
|
91
91
|
|
92
|
-
#
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
return nil if parent.name == 'body'
|
97
|
-
return node_heading_parent(parent, level)
|
92
|
+
# In Jekyll v2, tags are in data['tags'], or in .tags
|
93
|
+
if is_v2
|
94
|
+
tags = @file.tags if has_tags_method
|
95
|
+
tags = @file.data['tags'] if tags.empty? && has_tags_data
|
98
96
|
end
|
99
97
|
|
100
|
-
#
|
101
|
-
|
102
|
-
|
103
|
-
node_heading_parent(previous, level)
|
98
|
+
# Some extensions extend the tags with custom classes, so we make sure we
|
99
|
+
# cast them as strings
|
100
|
+
tags.map(&:to_s)
|
104
101
|
end
|
105
102
|
|
106
|
-
|
107
|
-
#
|
108
|
-
def
|
109
|
-
|
110
|
-
level = tag_name.delete('h').to_i
|
103
|
+
##
|
104
|
+
# Get the post date timestamp
|
105
|
+
def date
|
106
|
+
return nil unless @file.respond_to?(:date)
|
111
107
|
|
112
|
-
|
113
|
-
state[tag_name.to_sym] = node_text(node)
|
114
|
-
state[:level] = level
|
115
|
-
end
|
116
|
-
|
117
|
-
heading = node_heading_parent(node)
|
118
|
-
|
119
|
-
# No previous heading, we can stop the recursion
|
120
|
-
unless heading
|
121
|
-
state.delete(:level)
|
122
|
-
return state
|
123
|
-
end
|
124
|
-
|
125
|
-
node_hierarchy(heading, state)
|
108
|
+
@file.date.to_time.to_i
|
126
109
|
end
|
127
110
|
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
111
|
+
##
|
112
|
+
# Get the collection name of a document
|
113
|
+
def collection
|
114
|
+
return nil unless @file.respond_to?(:collection)
|
132
115
|
|
133
|
-
|
134
|
-
def node_text(node)
|
135
|
-
node.content.gsub('<', '<').gsub('>', '>')
|
136
|
-
end
|
116
|
+
collection_name = @file.collection.label
|
137
117
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
headings.map { |heading| data[heading.to_sym] }.compact.join(' > ')
|
118
|
+
# In Jekyll v3, posts are actually a collection
|
119
|
+
return nil if collection_name == 'posts'
|
120
|
+
collection_name
|
142
121
|
end
|
143
122
|
|
144
|
-
|
145
|
-
#
|
146
|
-
def
|
147
|
-
|
123
|
+
##
|
124
|
+
# Get a hash of all front-matter data
|
125
|
+
def front_matter
|
126
|
+
raw_data = @file.data
|
148
127
|
|
149
|
-
#
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
end
|
128
|
+
# We clean some keys that will be handled by specific methods
|
129
|
+
attributes_to_remove = %w(title tags slug url date type)
|
130
|
+
attributes_to_remove.each do |attribute|
|
131
|
+
raw_data.delete(attribute)
|
132
|
+
end
|
155
133
|
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
.select { |title| data.key?(title) }
|
162
|
-
.map { |title| data[title].to_s.split(/\W+/) }
|
163
|
-
.flatten
|
164
|
-
.compact
|
165
|
-
.map(&:downcase)
|
166
|
-
.uniq
|
167
|
-
# Intersect words in headings with words in test
|
168
|
-
text_words = data[:text].downcase.split(/\W+/)
|
169
|
-
(title_words & text_words).size
|
170
|
-
end
|
134
|
+
# Convert to symbols
|
135
|
+
data = {}
|
136
|
+
raw_data.each do |key, value|
|
137
|
+
data[key.to_sym] = value
|
138
|
+
end
|
171
139
|
|
172
|
-
|
173
|
-
def weight_tag_name(item)
|
174
|
-
tag_name = item[:tag_name]
|
175
|
-
# No a heading, no weight
|
176
|
-
return 0 unless %w(h1 h2 h3 h4 h5 h6).include?(tag_name)
|
177
|
-
# h1: 100, h2: 90, ..., h6: 50
|
178
|
-
100 - (tag_name.delete('h').to_i - 1) * 10
|
140
|
+
data
|
179
141
|
end
|
180
142
|
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
position: index
|
143
|
+
##
|
144
|
+
# Get the list of all node data
|
145
|
+
def hierarchy_nodes
|
146
|
+
extractor_options = {
|
147
|
+
css_selector: @config['record_css_selector']
|
187
148
|
}
|
149
|
+
|
150
|
+
HTMLHierarchyExtractor.new(
|
151
|
+
@file.content,
|
152
|
+
options: extractor_options
|
153
|
+
).extract
|
188
154
|
end
|
189
155
|
|
156
|
+
# Extract all records from the page and return the list
|
190
157
|
def extract
|
158
|
+
# Getting all hierarchical nodes from the HTML input
|
159
|
+
raw_items = hierarchy_nodes
|
160
|
+
|
161
|
+
# Shared attributes relative to the page that all records will have
|
162
|
+
shared_attributes = {
|
163
|
+
type: type,
|
164
|
+
url: url,
|
165
|
+
title: title,
|
166
|
+
slug: slug,
|
167
|
+
date: date,
|
168
|
+
collection: collection,
|
169
|
+
tags: tags
|
170
|
+
}
|
171
|
+
# Remove empty attributes
|
172
|
+
shared_attributes = shared_attributes.delete_if do |_, value|
|
173
|
+
value.nil?
|
174
|
+
end
|
175
|
+
|
176
|
+
# Enriching with page metadata
|
191
177
|
items = []
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
item =
|
196
|
-
item
|
197
|
-
item
|
198
|
-
|
199
|
-
item
|
200
|
-
item[:unique_hierarchy] = unique_hierarchy(item)
|
201
|
-
item[:css_selector] = node_css_selector(node)
|
202
|
-
item[:css_selector_parent] = node_css_selector(node_heading_parent(node))
|
203
|
-
item[:weight] = weight(item, index)
|
204
|
-
|
205
|
-
# We pass item through the user defined custom hook
|
206
|
-
item = custom_hook_each(item, node)
|
178
|
+
raw_items.each do |raw_item|
|
179
|
+
nokogiri_node = raw_item[:node]
|
180
|
+
raw_item.delete(:node)
|
181
|
+
item = shared_attributes.merge(raw_item)
|
182
|
+
item[:objectID] = item[:uuid]
|
183
|
+
item.delete(:uuid)
|
184
|
+
|
185
|
+
item = custom_hook_each(item, nokogiri_node)
|
207
186
|
next if item.nil?
|
208
187
|
|
209
188
|
items << item
|
210
189
|
end
|
190
|
+
|
211
191
|
custom_hook_all(items)
|
212
192
|
end
|
213
193
|
end
|