algoliasearch-jekyll 0.9.1 → 1.0.0.beta.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +3 -4
- data/CONTRIBUTING.md +8 -1
- data/Gemfile +4 -5
- data/README.md +318 -11
- data/Rakefile +7 -12
- data/algoliasearch-jekyll.gemspec +66 -62
- data/gemfiles/jekyll_v2.gemfile +3 -3
- data/gemfiles/jekyll_v3.gemfile +4 -4
- data/gemfiles/jekyll_v3_1_3.gemfile +24 -0
- data/gemfiles/jekyll_v3_1_6.gemfile +24 -0
- data/lib/algoliasearch-jekyll.rb +1 -3
- data/lib/credential_checker.rb +2 -1
- data/lib/error_handler.rb +6 -0
- data/lib/push.rb +81 -19
- data/lib/record_extractor.rb +120 -140
- data/lib/utils.rb +13 -0
- data/lib/version.rb +1 -1
- data/scripts/release +13 -12
- data/scripts/test_v3 +1 -1
- data/scripts/watch +4 -0
- data/spec/error_handler_spec.rb +17 -0
- data/spec/fixtures/jekyll_version_2/404.html +8 -0
- data/spec/fixtures/jekyll_version_2/404.md +9 -0
- data/spec/fixtures/jekyll_version_2/_my-collection/collection-item.md +3 -0
- data/spec/fixtures/jekyll_version_2/_posts/2015-07-02-test-post.md +1 -1
- data/spec/fixtures/jekyll_version_2/about.md +3 -0
- data/spec/fixtures/jekyll_version_2/front_matter.md +15 -0
- data/spec/fixtures/jekyll_version_2/index.html +3 -1
- data/spec/fixtures/jekyll_version_2/only-divs.md +15 -0
- data/spec/fixtures/jekyll_version_2/only-paragraphs.md +15 -0
- data/spec/fixtures/jekyll_version_3/404.html +8 -0
- data/spec/fixtures/jekyll_version_3/404.md +9 -0
- data/spec/fixtures/jekyll_version_3/_config.yml +1 -1
- data/spec/fixtures/jekyll_version_3/_my-collection/collection-item.md +3 -0
- data/spec/fixtures/jekyll_version_3/_posts/2015-07-02-test-post.md +1 -1
- data/spec/fixtures/jekyll_version_3/about.md +3 -0
- data/spec/fixtures/jekyll_version_3/front_matter.md +15 -0
- data/spec/fixtures/jekyll_version_3/index.html +4 -1
- data/spec/fixtures/jekyll_version_3/only-divs.md +15 -0
- data/spec/fixtures/jekyll_version_3/only-paragraphs.md +15 -0
- data/spec/push_spec.rb +211 -8
- data/spec/record_extractor_spec.rb +296 -358
- data/spec/spec_helper.rb +32 -11
- data/txt/record_too_big +19 -0
- metadata +40 -51
- data/scripts/watch +0 -1
data/gemfiles/jekyll_v2.gemfile
CHANGED
@@ -5,8 +5,9 @@ source "http://rubygems.org"
|
|
5
5
|
gem "algoliasearch", "~> 1.4"
|
6
6
|
gem "appraisal", "~> 2.1.0"
|
7
7
|
gem "awesome_print", "~> 1.6"
|
8
|
-
gem "json", "
|
9
|
-
gem "nokogiri",
|
8
|
+
gem "json", "~> 1.8"
|
9
|
+
gem "nokogiri", "~> 1.6"
|
10
|
+
gem "html-hierarchy-extractor", :path => "../../html-hierarchy-extractor/"
|
10
11
|
gem "verbal_expressions", "~> 0.1.5"
|
11
12
|
gem "jekyll", "~> 2.5"
|
12
13
|
|
@@ -19,5 +20,4 @@ group :development do
|
|
19
20
|
gem "rspec", "~> 3.0"
|
20
21
|
gem "rubocop", "~> 0.31"
|
21
22
|
gem "simplecov", "~> 0.10"
|
22
|
-
gem "rack", "< 2"
|
23
23
|
end
|
data/gemfiles/jekyll_v3.gemfile
CHANGED
@@ -5,10 +5,11 @@ source "http://rubygems.org"
|
|
5
5
|
gem "algoliasearch", "~> 1.4"
|
6
6
|
gem "appraisal", "~> 2.1.0"
|
7
7
|
gem "awesome_print", "~> 1.6"
|
8
|
-
gem "json", "
|
9
|
-
gem "nokogiri",
|
8
|
+
gem "json", "~> 1.8"
|
9
|
+
gem "nokogiri", "~> 1.6"
|
10
|
+
gem "html-hierarchy-extractor", :path => "../../html-hierarchy-extractor/"
|
10
11
|
gem "verbal_expressions", "~> 0.1.5"
|
11
|
-
gem "jekyll", "
|
12
|
+
gem "jekyll", "3.1.6"
|
12
13
|
gem "jekyll-paginate", "~> 1.1.0"
|
13
14
|
|
14
15
|
group :development do
|
@@ -20,5 +21,4 @@ group :development do
|
|
20
21
|
gem "rspec", "~> 3.0"
|
21
22
|
gem "rubocop", "~> 0.31"
|
22
23
|
gem "simplecov", "~> 0.10"
|
23
|
-
gem "rack", "< 2"
|
24
24
|
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# This file was generated by Appraisal
|
2
|
+
|
3
|
+
source "http://rubygems.org"
|
4
|
+
|
5
|
+
gem "algoliasearch", "~> 1.4"
|
6
|
+
gem "appraisal", "~> 2.1.0"
|
7
|
+
gem "awesome_print", "~> 1.6"
|
8
|
+
gem "json", "~> 1.8"
|
9
|
+
gem "nokogiri", "~> 1.6"
|
10
|
+
gem "html-hierarchy-extractor", :path => "../../html-hierarchy-extractor/"
|
11
|
+
gem "verbal_expressions", "~> 0.1.5"
|
12
|
+
gem "jekyll", "3.1.3"
|
13
|
+
gem "jekyll-paginate", "~> 1.1.0"
|
14
|
+
|
15
|
+
group :development do
|
16
|
+
gem "coveralls", "~> 0.8"
|
17
|
+
gem "flay", "~> 2.6"
|
18
|
+
gem "flog", "~> 4.3"
|
19
|
+
gem "guard-rspec", "~> 4.6"
|
20
|
+
gem "jeweler", "~> 2.0"
|
21
|
+
gem "rspec", "~> 3.0"
|
22
|
+
gem "rubocop", "~> 0.31"
|
23
|
+
gem "simplecov", "~> 0.10"
|
24
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# This file was generated by Appraisal
|
2
|
+
|
3
|
+
source "http://rubygems.org"
|
4
|
+
|
5
|
+
gem "algoliasearch", "~> 1.4"
|
6
|
+
gem "appraisal", "~> 2.1.0"
|
7
|
+
gem "awesome_print", "~> 1.6"
|
8
|
+
gem "json", "~> 1.8"
|
9
|
+
gem "nokogiri", "~> 1.6"
|
10
|
+
gem "html-hierarchy-extractor", :path => "../../html-hierarchy-extractor/"
|
11
|
+
gem "verbal_expressions", "~> 0.1.5"
|
12
|
+
gem "jekyll", "3.1.6"
|
13
|
+
gem "jekyll-paginate", "~> 1.1.0"
|
14
|
+
|
15
|
+
group :development do
|
16
|
+
gem "coveralls", "~> 0.8"
|
17
|
+
gem "flay", "~> 2.6"
|
18
|
+
gem "flog", "~> 4.3"
|
19
|
+
gem "guard-rspec", "~> 4.6"
|
20
|
+
gem "jeweler", "~> 2.0"
|
21
|
+
gem "rspec", "~> 3.0"
|
22
|
+
gem "rubocop", "~> 0.31"
|
23
|
+
gem "simplecov", "~> 0.10"
|
24
|
+
end
|
data/lib/algoliasearch-jekyll.rb
CHANGED
@@ -1,12 +1,10 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'bundler/setup'
|
3
|
-
|
4
3
|
require 'awesome_print'
|
5
|
-
|
6
4
|
require_relative './version'
|
7
5
|
require_relative './push'
|
8
6
|
|
9
|
-
# `jekyll algolia`
|
7
|
+
# Registering the `jekyll algolia push` command
|
10
8
|
class AlgoliaSearchJekyll < Jekyll::Command
|
11
9
|
class << self
|
12
10
|
def init_with_program(prog)
|
data/lib/credential_checker.rb
CHANGED
@@ -3,7 +3,8 @@ require 'nokogiri'
|
|
3
3
|
require 'json'
|
4
4
|
require_relative './error_handler.rb'
|
5
5
|
|
6
|
-
#
|
6
|
+
# Will check that all the needed credentials are correctly given by the user
|
7
|
+
# before starting any push process
|
7
8
|
class AlgoliaSearchCredentialChecker
|
8
9
|
attr_accessor :config, :logger
|
9
10
|
|
data/lib/error_handler.rb
CHANGED
@@ -82,6 +82,12 @@ class AlgoliaSearchErrorHandler
|
|
82
82
|
return 'check_key_acl_to_tmp_index'
|
83
83
|
end
|
84
84
|
|
85
|
+
# Pushed record is above the 10KB limit
|
86
|
+
if error['http_error'] == 400 &&
|
87
|
+
error['json']['message'] =~ /^Record is too big/
|
88
|
+
return 'record_too_big'
|
89
|
+
end
|
90
|
+
|
85
91
|
false
|
86
92
|
end
|
87
93
|
end
|
data/lib/push.rb
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
require 'algoliasearch'
|
2
|
-
require 'nokogiri'
|
3
2
|
require 'json'
|
3
|
+
require 'nokogiri'
|
4
4
|
require_relative './version'
|
5
5
|
require_relative './record_extractor'
|
6
6
|
require_relative './credential_checker'
|
7
7
|
require_relative './error_handler'
|
8
8
|
|
9
|
-
# `jekyll algolia push` command
|
9
|
+
# `jekyll algolia push` main command
|
10
10
|
class AlgoliaSearchJekyllPush < Jekyll::Command
|
11
11
|
class << self
|
12
12
|
attr_accessor :options, :config
|
@@ -22,30 +22,42 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
22
22
|
@args = args
|
23
23
|
@options = options
|
24
24
|
@config = config
|
25
|
+
@checker = AlgoliaSearchCredentialChecker.new(@config)
|
25
26
|
@is_verbose = @config['verbose']
|
26
27
|
@is_dry_run = @config['dry_run']
|
28
|
+
@is_lazy_update = lazy_update?
|
27
29
|
|
28
30
|
self
|
29
31
|
end
|
30
32
|
|
33
|
+
# Check if the lazy update feature is enabled or not (default to false)
|
34
|
+
def lazy_update?
|
35
|
+
return false unless @config['algolia']
|
36
|
+
return true if @config['algolia']['lazy_update']
|
37
|
+
false
|
38
|
+
end
|
39
|
+
|
31
40
|
# Check if the specified file should be indexed (we exclude static files,
|
32
41
|
# robots.txt and custom defined exclusions).
|
33
42
|
def indexable?(file)
|
43
|
+
# Excluding all static assets (images, fonts, etc)
|
34
44
|
return false if file.is_a?(Jekyll::StaticFile)
|
35
45
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
# Keep only markdown and html files
|
46
|
+
# Jekyll auto-converts markdown to HTML, so if the file is neither
|
47
|
+
# markdown or HTML, we should probably not index it
|
40
48
|
allowed_extensions = %w(html)
|
41
49
|
if @config['markdown_ext']
|
42
50
|
allowed_extensions += @config['markdown_ext'].split(',')
|
43
51
|
end
|
44
|
-
|
45
|
-
|
46
|
-
end
|
47
|
-
return false unless allowed_extensions.include?(extname)
|
52
|
+
extname = File.extname(File.basename(file.path))
|
53
|
+
return false unless allowed_extensions.include?(extname[1..-1])
|
48
54
|
|
55
|
+
# We should not index GitHub pages 404 pages
|
56
|
+
# https://help.github.com/articles/creating-a-custom-404-page-for-your-github-pages-site/
|
57
|
+
basename_no_ext = File.basename(file.path, extname)
|
58
|
+
return false if basename_no_ext == '404'
|
59
|
+
|
60
|
+
# Users can also define their own blacklist and hooks to exclude files
|
49
61
|
return false if excluded_file?(file)
|
50
62
|
|
51
63
|
true
|
@@ -53,14 +65,17 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
53
65
|
|
54
66
|
# Check if the file is in the list of excluded files
|
55
67
|
def excluded_file?(file)
|
68
|
+
# Blacklist of pages generated by Jekyll that we know should not be
|
69
|
+
# indexing
|
56
70
|
excluded = [
|
57
|
-
|
71
|
+
/^index\.html$/, # Index page
|
72
|
+
%r{^page([0-9]*)/index\.html} # Pagination pages
|
58
73
|
]
|
74
|
+
# User-provided blacklist
|
59
75
|
if @config['algolia']
|
60
76
|
excluded += (@config['algolia']['excluded_files'] || [])
|
61
77
|
end
|
62
78
|
|
63
|
-
# Exclude files explicitly excluded in _config
|
64
79
|
excluded.each do |pattern|
|
65
80
|
pattern = /#{Regexp.quote(pattern)}/ if pattern.is_a? String
|
66
81
|
return true if file.path =~ pattern
|
@@ -88,14 +103,17 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
88
103
|
items = []
|
89
104
|
is_verbose = config['verbose']
|
90
105
|
each_site_file do |file|
|
106
|
+
# Skip files that should not be indexed
|
91
107
|
next unless AlgoliaSearchJekyllPush.indexable?(file)
|
92
108
|
Jekyll.logger.info "Extracting data from #{file.path}" if is_verbose
|
109
|
+
|
93
110
|
new_items = AlgoliaSearchRecordExtractor.new(file).extract
|
94
111
|
next if new_items.nil?
|
95
112
|
ap new_items if is_verbose
|
96
113
|
|
97
114
|
items += new_items
|
98
115
|
end
|
116
|
+
|
99
117
|
AlgoliaSearchJekyllPush.push(items)
|
100
118
|
end
|
101
119
|
|
@@ -178,14 +196,11 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
178
196
|
end
|
179
197
|
end
|
180
198
|
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
Jekyll.logger.info '=== DRY RUN ===' if @is_dry_run
|
186
|
-
|
199
|
+
# Greedy update will push all the records to a temporary index, then
|
200
|
+
# override the existing index with this temp one
|
201
|
+
def greedy_update(items)
|
187
202
|
# Add items to a temp index, then rename it
|
188
|
-
index_name = checker.index_name
|
203
|
+
index_name = @checker.index_name
|
189
204
|
index_name_tmp = "#{index_name}_tmp"
|
190
205
|
batch_add_items(items, create_index(index_name_tmp))
|
191
206
|
Algolia.move_index(index_name_tmp, index_name) unless @is_dry_run
|
@@ -193,5 +208,52 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
|
|
193
208
|
Jekyll.logger.info "Indexing of #{items.size} items " \
|
194
209
|
"in #{index_name} done."
|
195
210
|
end
|
211
|
+
|
212
|
+
# Lazy update will minimize the number of operations by only pushing new
|
213
|
+
# data and deleting old data
|
214
|
+
def lazy_update(items)
|
215
|
+
index = create_index(@checker.index_name)
|
216
|
+
remote = remote_ids(index)
|
217
|
+
local = items.map { |item| item[:objectID] }
|
218
|
+
|
219
|
+
delete_remote_not_in_local(index, local, remote)
|
220
|
+
|
221
|
+
add_local_not_in_remote(index, items, local, remote)
|
222
|
+
end
|
223
|
+
|
224
|
+
# Array of all objectID in the remote index
|
225
|
+
def remote_ids(index)
|
226
|
+
list = []
|
227
|
+
index.browse(attributesToRetrieve: 'objectID') do |hit|
|
228
|
+
list << hit['objectID']
|
229
|
+
end
|
230
|
+
list
|
231
|
+
end
|
232
|
+
|
233
|
+
# Delete all remote items that are no longer in the local items
|
234
|
+
def delete_remote_not_in_local(index, local, remote)
|
235
|
+
list = remote - local
|
236
|
+
Jekyll.logger.info "Deleting #{list.size} items"
|
237
|
+
index.delete_objects!(list) unless list.empty?
|
238
|
+
end
|
239
|
+
|
240
|
+
# Push all local items that are not yet in the index
|
241
|
+
def add_local_not_in_remote(index, items, local, remote)
|
242
|
+
list = local - remote
|
243
|
+
return Jekyll.logger.info "Adding #{list.size} items" if list.empty?
|
244
|
+
items_to_push = items.select do |item|
|
245
|
+
list.include?(item[:objectID])
|
246
|
+
end
|
247
|
+
batch_add_items(items_to_push, index)
|
248
|
+
end
|
249
|
+
|
250
|
+
def push(items)
|
251
|
+
checker = AlgoliaSearchCredentialChecker.new(@config)
|
252
|
+
checker.assert_valid
|
253
|
+
|
254
|
+
Jekyll.logger.info '=== DRY RUN ===' if @is_dry_run
|
255
|
+
|
256
|
+
@is_lazy_update ? lazy_update(items) : greedy_update(items)
|
257
|
+
end
|
196
258
|
end
|
197
259
|
end
|
data/lib/record_extractor.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'algoliasearch'
|
2
2
|
require 'nokogiri'
|
3
3
|
require 'json'
|
4
|
+
require 'html-hierarchy-extractor'
|
5
|
+
require_relative './utils'
|
4
6
|
|
5
7
|
# Given an HTML file as input, will return an array of records to index
|
6
8
|
class AlgoliaSearchRecordExtractor
|
@@ -25,189 +27,167 @@ class AlgoliaSearchRecordExtractor
|
|
25
27
|
items
|
26
28
|
end
|
27
29
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
30
|
+
##
|
31
|
+
# Return the type of the Jekyll element
|
32
|
+
# It can be either page, post or document
|
33
|
+
def type
|
34
|
+
classname = @file.class.name
|
35
|
+
subclass = classname.split('::')[1]
|
36
|
+
type = subclass.downcase
|
32
37
|
|
33
|
-
|
34
|
-
|
38
|
+
# In Jekyll v2, Page, Post and Document have their own class
|
39
|
+
return type if AlgoliaSearchUtils.restrict_jekyll_version(less_than: '3.0')
|
35
40
|
|
36
|
-
|
41
|
+
# In Jekyll v3, Post are actually a specific type of Documents
|
42
|
+
if type == 'document'
|
43
|
+
collection_name = @file.collection.label
|
44
|
+
return 'post' if collection_name == 'posts'
|
45
|
+
end
|
37
46
|
|
38
|
-
|
39
|
-
|
47
|
+
type
|
48
|
+
end
|
40
49
|
|
41
|
-
|
50
|
+
##
|
51
|
+
# Return the url of the page
|
52
|
+
def url
|
53
|
+
@file.url
|
42
54
|
end
|
43
55
|
|
56
|
+
##
|
57
|
+
# Return the title of the page
|
58
|
+
def title
|
59
|
+
@file.data['title']
|
60
|
+
end
|
61
|
+
|
62
|
+
##
|
44
63
|
# Returns the slug of the document
|
45
64
|
def slug
|
46
|
-
#
|
47
|
-
return @file.data['slug'] if @file.data.key?('slug')
|
48
|
-
# Old Jekyll v2 has it at the root
|
49
|
-
return @file.slug if @file.respond_to? :slug
|
50
|
-
# Otherwise, we guess it from the filename
|
65
|
+
# We can guess the slug from the filename for all documents
|
51
66
|
basename = File.basename(@file.path)
|
52
67
|
extname = File.extname(basename)
|
53
|
-
File.basename(basename, extname)
|
54
|
-
end
|
55
|
-
|
56
|
-
# Extract a list of tags
|
57
|
-
def tags
|
58
|
-
tags = nil
|
68
|
+
slug = File.basename(basename, extname)
|
59
69
|
|
60
|
-
# Jekyll v3
|
61
|
-
if @file.data.key?('
|
62
|
-
tags = @file.data['tags']
|
63
|
-
elsif @file.respond_to? :tags
|
64
|
-
tags = @file.tags
|
65
|
-
end
|
66
|
-
|
67
|
-
return tags if tags.nil?
|
70
|
+
# Jekyll v3 posts have it in data
|
71
|
+
return @file.data['slug'] if @file.data.key?('slug')
|
68
72
|
|
69
|
-
#
|
70
|
-
|
71
|
-
tags.map(&:to_s)
|
72
|
-
end
|
73
|
+
# Jekyll v2 posts have a specific slug method
|
74
|
+
return @file.slug if @file.respond_to?(:slug)
|
73
75
|
|
74
|
-
|
75
|
-
def html_nodes
|
76
|
-
document = Nokogiri::HTML(@file.content)
|
77
|
-
document.css(@config['record_css_selector'])
|
76
|
+
slug
|
78
77
|
end
|
79
78
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
79
|
+
##
|
80
|
+
# Get an array of tags of the document
|
81
|
+
def tags
|
82
|
+
tags = []
|
84
83
|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
84
|
+
is_v2 = AlgoliaSearchUtils.restrict_jekyll_version(less_than: '3.0')
|
85
|
+
is_v3 = AlgoliaSearchUtils.restrict_jekyll_version(more_than: '3.0')
|
86
|
+
has_tags_method = @file.respond_to?(:tags)
|
87
|
+
has_tags_data = @file.data.key?('tags')
|
89
88
|
|
90
|
-
|
89
|
+
# Starting from Jekyll v3, all tags are in data['tags']
|
90
|
+
tags = @file.data['tags'] if is_v3 && has_tags_data
|
91
91
|
|
92
|
-
#
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
return nil if parent.name == 'body'
|
97
|
-
return node_heading_parent(parent, level)
|
92
|
+
# In Jekyll v2, tags are in data['tags'], or in .tags
|
93
|
+
if is_v2
|
94
|
+
tags = @file.tags if has_tags_method
|
95
|
+
tags = @file.data['tags'] if tags.empty? && has_tags_data
|
98
96
|
end
|
99
97
|
|
100
|
-
#
|
101
|
-
|
102
|
-
|
103
|
-
node_heading_parent(previous, level)
|
98
|
+
# Some extension extends the tags with custom classes, so we make sure we
|
99
|
+
# cast them as strings
|
100
|
+
tags.map(&:to_s)
|
104
101
|
end
|
105
102
|
|
106
|
-
|
107
|
-
#
|
108
|
-
def
|
109
|
-
|
110
|
-
level = tag_name.delete('h').to_i
|
103
|
+
##
|
104
|
+
# Get the post date timestamp
|
105
|
+
def date
|
106
|
+
return nil unless @file.respond_to?(:date)
|
111
107
|
|
112
|
-
|
113
|
-
state[tag_name.to_sym] = node_text(node)
|
114
|
-
state[:level] = level
|
115
|
-
end
|
116
|
-
|
117
|
-
heading = node_heading_parent(node)
|
118
|
-
|
119
|
-
# No previous heading, we can stop the recursion
|
120
|
-
unless heading
|
121
|
-
state.delete(:level)
|
122
|
-
return state
|
123
|
-
end
|
124
|
-
|
125
|
-
node_hierarchy(heading, state)
|
108
|
+
@file.date.to_time.to_i
|
126
109
|
end
|
127
110
|
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
111
|
+
##
|
112
|
+
# Get the collection name of a document
|
113
|
+
def collection
|
114
|
+
return nil unless @file.respond_to?(:collection)
|
132
115
|
|
133
|
-
|
134
|
-
def node_text(node)
|
135
|
-
node.content.gsub('<', '<').gsub('>', '>')
|
136
|
-
end
|
116
|
+
collection_name = @file.collection.label
|
137
117
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
headings.map { |heading| data[heading.to_sym] }.compact.join(' > ')
|
118
|
+
# In Jekyll v3, posts are actually a collection
|
119
|
+
return nil if collection_name == 'posts'
|
120
|
+
collection_name
|
142
121
|
end
|
143
122
|
|
144
|
-
|
145
|
-
#
|
146
|
-
def
|
147
|
-
|
123
|
+
##
|
124
|
+
# Get a hash of all front-matter data
|
125
|
+
def front_matter
|
126
|
+
raw_data = @file.data
|
148
127
|
|
149
|
-
#
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
end
|
128
|
+
# We clean some keys that will be handled by specific methods
|
129
|
+
attributes_to_remove = %w(title tags slug url date type)
|
130
|
+
attributes_to_remove.each do |attribute|
|
131
|
+
raw_data.delete(attribute)
|
132
|
+
end
|
155
133
|
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
.select { |title| data.key?(title) }
|
162
|
-
.map { |title| data[title].to_s.split(/\W+/) }
|
163
|
-
.flatten
|
164
|
-
.compact
|
165
|
-
.map(&:downcase)
|
166
|
-
.uniq
|
167
|
-
# Intersect words in headings with words in test
|
168
|
-
text_words = data[:text].downcase.split(/\W+/)
|
169
|
-
(title_words & text_words).size
|
170
|
-
end
|
134
|
+
# Convert to symbols
|
135
|
+
data = {}
|
136
|
+
raw_data.each do |key, value|
|
137
|
+
data[key.to_sym] = value
|
138
|
+
end
|
171
139
|
|
172
|
-
|
173
|
-
def weight_tag_name(item)
|
174
|
-
tag_name = item[:tag_name]
|
175
|
-
# No a heading, no weight
|
176
|
-
return 0 unless %w(h1 h2 h3 h4 h5 h6).include?(tag_name)
|
177
|
-
# h1: 100, h2: 90, ..., h6: 50
|
178
|
-
100 - (tag_name.delete('h').to_i - 1) * 10
|
140
|
+
data
|
179
141
|
end
|
180
142
|
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
position: index
|
143
|
+
##
|
144
|
+
# Get the list of all node data
|
145
|
+
def hierarchy_nodes
|
146
|
+
extractor_options = {
|
147
|
+
css_selector: @config['record_css_selector']
|
187
148
|
}
|
149
|
+
|
150
|
+
HTMLHierarchyExtractor.new(
|
151
|
+
@file.content,
|
152
|
+
options: extractor_options
|
153
|
+
).extract
|
188
154
|
end
|
189
155
|
|
156
|
+
# Extract all records from the page and return the list
|
190
157
|
def extract
|
158
|
+
# Getting all hierarchical nodes from the HTML input
|
159
|
+
raw_items = hierarchy_nodes
|
160
|
+
|
161
|
+
# Shared attributes relative to the page that all records will have
|
162
|
+
shared_attributes = {
|
163
|
+
type: type,
|
164
|
+
url: url,
|
165
|
+
title: title,
|
166
|
+
slug: slug,
|
167
|
+
date: date,
|
168
|
+
collection: collection,
|
169
|
+
tags: tags
|
170
|
+
}
|
171
|
+
# Remove empty attributes
|
172
|
+
shared_attributes = shared_attributes.delete_if do |_, value|
|
173
|
+
value.nil?
|
174
|
+
end
|
175
|
+
|
176
|
+
# Enriching with page metadata
|
191
177
|
items = []
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
item =
|
196
|
-
item
|
197
|
-
item
|
198
|
-
|
199
|
-
item
|
200
|
-
item[:unique_hierarchy] = unique_hierarchy(item)
|
201
|
-
item[:css_selector] = node_css_selector(node)
|
202
|
-
item[:css_selector_parent] = node_css_selector(node_heading_parent(node))
|
203
|
-
item[:weight] = weight(item, index)
|
204
|
-
|
205
|
-
# We pass item through the user defined custom hook
|
206
|
-
item = custom_hook_each(item, node)
|
178
|
+
raw_items.each do |raw_item|
|
179
|
+
nokogiri_node = raw_item[:node]
|
180
|
+
raw_item.delete(:node)
|
181
|
+
item = shared_attributes.merge(raw_item)
|
182
|
+
item[:objectID] = item[:uuid]
|
183
|
+
item.delete(:uuid)
|
184
|
+
|
185
|
+
item = custom_hook_each(item, nokogiri_node)
|
207
186
|
next if item.nil?
|
208
187
|
|
209
188
|
items << item
|
210
189
|
end
|
190
|
+
|
211
191
|
custom_hook_all(items)
|
212
192
|
end
|
213
193
|
end
|