jekyll-algolia 1.0.0 → 1.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. checksums.yaml +4 -4
  2. data/CONTRIBUTING.md +51 -30
  3. data/README.md +69 -27
  4. data/lib/errors/invalid_credentials.txt +12 -0
  5. data/lib/errors/invalid_index_name.txt +9 -0
  6. data/lib/errors/missing_api_key.txt +15 -0
  7. data/lib/errors/missing_application_id.txt +11 -0
  8. data/lib/errors/missing_index_name.txt +18 -0
  9. data/lib/errors/no_records_found.txt +14 -0
  10. data/lib/errors/record_too_big.txt +27 -0
  11. data/lib/errors/record_too_big_api.txt +10 -0
  12. data/lib/errors/settings_manually_edited.txt +17 -0
  13. data/lib/errors/too_many_records.txt +14 -0
  14. data/lib/errors/unknown_application_id.txt +16 -0
  15. data/lib/errors/unknown_settings.txt +12 -0
  16. data/lib/jekyll-algolia.rb +45 -60
  17. data/lib/jekyll/algolia/configurator.rb +137 -44
  18. data/lib/jekyll/algolia/error_handler.rb +36 -48
  19. data/lib/jekyll/algolia/extractor.rb +16 -6
  20. data/lib/jekyll/algolia/file_browser.rb +161 -68
  21. data/lib/jekyll/algolia/hooks.rb +18 -6
  22. data/lib/jekyll/algolia/indexer.rb +283 -145
  23. data/lib/jekyll/algolia/logger.rb +39 -8
  24. data/lib/jekyll/algolia/overwrites/githubpages-configuration.rb +32 -0
  25. data/lib/jekyll/algolia/overwrites/jekyll-algolia-site.rb +151 -0
  26. data/lib/jekyll/algolia/overwrites/jekyll-document.rb +13 -0
  27. data/lib/jekyll/algolia/overwrites/jekyll-paginate-pager.rb +20 -0
  28. data/lib/jekyll/algolia/overwrites/jekyll-tags-link.rb +33 -0
  29. data/lib/jekyll/algolia/progress_bar.rb +27 -0
  30. data/lib/jekyll/algolia/shrinker.rb +112 -0
  31. data/lib/jekyll/algolia/utils.rb +118 -2
  32. data/lib/jekyll/algolia/version.rb +1 -1
  33. data/lib/jekyll/commands/algolia.rb +3 -14
  34. metadata +75 -31
  35. data/errors/invalid_credentials.txt +0 -10
  36. data/errors/invalid_credentials_for_tmp_index.txt +0 -17
  37. data/errors/invalid_index_name.txt +0 -11
  38. data/errors/missing_api_key.txt +0 -17
  39. data/errors/missing_application_id.txt +0 -12
  40. data/errors/missing_index_name.txt +0 -19
  41. data/errors/no_records_found.txt +0 -20
  42. data/errors/record_too_big.txt +0 -25
  43. data/errors/unknown_application_id.txt +0 -20
  44. data/errors/unknown_settings.txt +0 -15
@@ -11,13 +11,15 @@ module Jekyll
11
11
  # Public: Extract records from the file
12
12
  #
13
13
  # file - The Jekyll file to process
14
- # TOTEST
15
14
  def self.run(file)
16
- # Getting all hierarchical nodes from the HTML input
15
+ # Getting all nodes from the HTML input
17
16
  raw_records = extract_raw_records(file.content)
18
17
  # Getting file metadata
19
18
  shared_metadata = FileBrowser.metadata(file)
20
19
 
20
+ # If no content, we still index the metadata
21
+ raw_records = [shared_metadata] if raw_records.empty?
22
+
21
23
  # Building the list of records
22
24
  records = []
23
25
  raw_records.map do |record|
@@ -31,7 +33,7 @@ module Jekyll
31
33
  # Apply custom user-defined hooks
32
34
  # Users can return `nil` from the hook to signal we should not index
33
35
  # such a record
34
- record = Hooks.apply_each(record, node)
36
+ record = Hooks.apply_each(record, node, Jekyll::Algolia.site)
35
37
  next if record.nil?
36
38
 
37
39
  records << record
@@ -48,16 +50,24 @@ module Jekyll
48
50
  end
49
51
 
50
52
  # Public: Extract raw records from the file, including content for each
51
- # node to index and hierarchy
53
+ # node and its headings
52
54
  #
53
55
  # content - The HTML content to parse
54
56
  def self.extract_raw_records(content)
55
- AlgoliaHTMLExtractor.run(
57
+ records = AlgoliaHTMLExtractor.run(
56
58
  content,
57
59
  options: {
58
- css_selector: Configurator.algolia('nodes_to_index')
60
+ css_selector: Configurator.algolia('nodes_to_index'),
61
+ tags_to_exclude: 'script,style,iframe'
59
62
  }
60
63
  )
64
+ # We remove objectIDs, as they will be added at the very end, after all
65
+ # the hooks and shrinkage
66
+ records.each do |record|
67
+ record.delete(:objectID)
68
+ end
69
+
70
+ records
61
71
  end
62
72
  end
63
73
  end
@@ -1,6 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'algolia_html_extractor'
4
+ require 'pathname'
5
+ require 'time'
4
6
 
5
7
  module Jekyll
6
8
  module Algolia
@@ -13,6 +15,50 @@ module Jekyll
13
15
  module FileBrowser
14
16
  include Jekyll::Algolia
15
17
 
18
+ # Public: Return the absolute path of a Jekyll file
19
+ #
20
+ # filepath - The path of the Jekyll file to inspect
21
+ def self.absolute_path(filepath)
22
+ pathname = Pathname.new(filepath)
23
+ return pathname.cleanpath.to_s if pathname.absolute?
24
+
25
+ File.expand_path(File.join(Configurator.get('source'), filepath))
26
+ end
27
+
28
+ # Public: Return the path of a Jekyll file relative to the Jekyll source
29
+ #
30
+ # file - The Jekyll file to inspect
31
+ def self.relative_path(filepath)
32
+ pathname = Pathname.new(filepath)
33
+ config_source = Configurator.get('source') || ''
34
+ jekyll_source = Pathname.new(File.expand_path(config_source))
35
+
36
+ # Removing any starting ./
37
+ if pathname.relative?
38
+ fullpath = File.expand_path(File.join(jekyll_source, pathname))
39
+ return fullpath.gsub(%r{^#{jekyll_source}/}, '')
40
+ end
41
+
42
+ pathname.relative_path_from(jekyll_source).cleanpath.to_s
43
+ end
44
+
45
+ # Public: Check if the file should be indexed
46
+ #
47
+ # file - The Jekyll file
48
+ #
49
+ # There are many reasons a file should not be indexed. We need to exclude
50
+ # all the static assets, only keep the actual content.
51
+ def self.indexable?(file)
52
+ return false if static_file?(file)
53
+ return false if is_404?(file)
54
+ return false if redirect?(file)
55
+ return false unless allowed_extension?(file)
56
+ return false if excluded_from_config?(file)
57
+ return false if excluded_from_hook?(file)
58
+
59
+ true
60
+ end
61
+
16
62
  # Public: Check if the specified file is a static Jekyll asset
17
63
  #
18
64
  # file - The Jekyll file
@@ -30,20 +76,27 @@ module Jekyll
30
76
  # pages. We don't want to index those.
31
77
  # Source: https://help.github.com/articles/creating-a-custom-404-page-for-your-github-pages-site/
32
78
  #
33
- # rubocop:disable Naming/PredicateName
34
79
  def self.is_404?(file)
35
- File.basename(file.path, File.extname(file.path)) == '404'
80
+ ['404.md', '404.html'].include?(File.basename(file.path))
36
81
  end
37
- # rubocop:enable Naming/PredicateName
38
82
 
39
- # Public: Check if the page is a pagination page
83
+ # Public: Check if the file is a redirect page
40
84
  #
41
85
  # file - The Jekyll file
42
86
  #
43
- # `jekyll-paginate` automatically creates pages to paginate through posts.
44
- # We don't want to index those
45
- def self.pagination_page?(file)
46
- Utils.match?(file.path, %r{page([0-9]*)/index\.html$})
87
+ # Plugins like jekyll-redirect-from add dynamic pages that only contain
88
+ # an HTML meta refresh. We need to exclude those files from indexing.
89
+ # https://github.com/jekyll/jekyll-redirect-from
90
+ def self.redirect?(file)
91
+ # When using redirect_from, jekyll-redirect-from creates a page named
92
+ # `redirect.html`
93
+ return true if file.respond_to?(:name) && file.name == 'redirect.html'
94
+ # When using redirect_to, it sets the layout to `redirect`
95
+ if file.respond_to?(:data) && file.data['layout'] == 'redirect'
96
+ return true
97
+ end
98
+
99
+ false
47
100
  end
48
101
 
49
102
  # Public: Check if the file has one of the allowed extensions
@@ -55,36 +108,24 @@ module Jekyll
55
108
  # and raw HTML files but this list can be extended using the
56
109
  # `extensions_to_index` config option.
57
110
  def self.allowed_extension?(file)
58
- extensions = Configurator.algolia('extensions_to_index')
111
+ extensions = Configurator.extensions_to_index
59
112
  extname = File.extname(file.path)[1..-1]
60
113
  extensions.include?(extname)
61
114
  end
62
115
 
63
- # Public: Check if the file has been excluded by the user
64
- #
65
- # file - The Jekyll file
66
- #
67
- # Files can be excluded either by setting the `files_to_exclude` option,
68
- # or by defining a custom hook
69
- def self.excluded_by_user?(file)
70
- excluded_from_config?(file) || excluded_from_hook?(file)
71
- end
72
-
73
116
  # Public: Check if the file has been excluded by `files_to_exclude`
74
117
  #
75
118
  # file - The Jekyll file
76
119
  def self.excluded_from_config?(file)
77
120
  excluded_patterns = Configurator.algolia('files_to_exclude')
78
- excluded_files = []
121
+ jekyll_source = Configurator.get('source')
122
+ path = absolute_path(file.path)
79
123
 
80
- # Transform the glob patterns into a real list of files
81
- Dir.chdir(Configurator.get('source')) do
82
- excluded_patterns.each do |pattern|
83
- excluded_files += Dir.glob(pattern)
84
- end
124
+ excluded_patterns.each do |pattern|
125
+ pattern = File.expand_path(File.join(jekyll_source, pattern))
126
+ return true if File.fnmatch(pattern, path, File::FNM_PATHNAME)
85
127
  end
86
-
87
- excluded_files.include?(file.path)
128
+ false
88
129
  end
89
130
 
90
131
  # Public: Check if the file has been excluded by running a custom user
@@ -95,34 +136,6 @@ module Jekyll
95
136
  Hooks.should_be_excluded?(file.path)
96
137
  end
97
138
 
98
- # Public: Return the path to the original file, relative from the Jekyll
99
- # source
100
- #
101
- # file - The Jekyll file
102
- #
103
- # Pages have their .path property relative to the source, but collections
104
- # (including posts) have an absolute file path.
105
- def self.path_from_root(file)
106
- source = Configurator.get('source')
107
- file.path.gsub(%r{^#{source}/}, '')
108
- end
109
-
110
- # Public: Check if the file should be indexed
111
- #
112
- # file - The Jekyll file
113
- #
114
- # There are many reasons a file should not be indexed. We need to exclude
115
- # all the static assets, only keep the actual content.
116
- def self.indexable?(file)
117
- return false if static_file?(file)
118
- return false if is_404?(file)
119
- return false if pagination_page?(file)
120
- return false unless allowed_extension?(file)
121
- return false if excluded_by_user?(file)
122
-
123
- true
124
- end
125
-
126
139
  # Public: Return a hash of all the file metadata
127
140
  #
128
141
  # file - The Jekyll file
@@ -134,6 +147,8 @@ module Jekyll
134
147
  raw_data = raw_data(file)
135
148
  specific_data = {
136
149
  collection: collection(file),
150
+ tags: tags(file),
151
+ categories: categories(file),
137
152
  date: date(file),
138
153
  excerpt_html: excerpt_html(file),
139
154
  excerpt_text: excerpt_text(file),
@@ -164,10 +179,16 @@ module Jekyll
164
179
  data.each_key do |key|
165
180
  data.delete(key) if respond_to?(key)
166
181
  end
167
-
168
- # Also delete keys we manually handle
169
182
  data.delete('excerpt')
170
183
 
184
+ # Delete other keys added by Jekyll that are not in the front-matter and
185
+ # not needed for search
186
+ data.delete('draft')
187
+ data.delete('ext')
188
+
189
+ # Convert all values to a version that can be serialized to JSON
190
+ data = Utils.jsonify(data)
191
+
171
192
  # Convert all keys to symbols
172
193
  data = Utils.keys_to_symbols(data)
173
194
 
@@ -196,29 +217,102 @@ module Jekyll
196
217
  file.url
197
218
  end
198
219
 
220
+ # Public: Returns the list of tags of a file, defaults to an empty array
221
+ #
222
+ # file - The Jekyll file
223
+ def self.tags(file)
224
+ file.data['tags'] || []
225
+ end
226
+
227
+ # Public: Returns the list of categories of a file, defaults to an empty array
228
+ #
229
+ # file - The Jekyll file
230
+ def self.categories(file)
231
+ file.data['categories'] || []
232
+ end
233
+
199
234
  # Public: Returns a timestamp of the file date
200
235
  #
201
236
  # file - The Jekyll file
202
237
  #
203
- # All collections have a date, either taken from the filename, or the
204
- # `date` config set in the front-matter. Even if none is set, the current
205
- # date is taken by default.
238
+ # Posts have their date coming from the filepath, or the front-matter.
239
+ # Pages and other collection items can only have a date set in
240
+ # front-matter.
206
241
  def self.date(file)
207
- date = file.data['date']
242
+ # Collections get their date from .date, while pages read it from .data.
243
+ # Jekyll by default will set the date of collection to the current date,
244
+ # but we monkey-patched that so it returns nil for collection items
245
+ date = if file.respond_to?(:date)
246
+ file.date
247
+ else
248
+ file.data['date']
249
+ end
250
+
208
251
  return nil if date.nil?
209
252
 
210
- date.to_i
253
+ # If date is a string, we try to parse it
254
+ if date.is_a? String
255
+ begin
256
+ date = Time.parse(date)
257
+ rescue StandardError
258
+ return nil
259
+ end
260
+ end
261
+
262
+ date.to_time.to_i
211
263
  end
212
264
 
213
- # Public: Returns the HTML version of the excerpt
265
+ # Public: Returns the raw excerpt of a file, directly as returned by
266
+ # Jekyll. Swallow any error that could occur when reading.
214
267
  #
215
268
  # file - The Jekyll file
216
269
  #
217
- # Only collections (including posts) have an excerpt. Pages don't.
270
+ # This might throw an exception if the excerpt is invalid. We also
271
+ # silence all logger output as Jekyll is quite verbose and will display
272
+ # the potential Liquid error in the terminal, even if we catch the actual
273
+ # error.
274
+ def self.excerpt_raw(file)
275
+ Logger.silent do
276
+ return file.data['excerpt'].to_s.strip
277
+ end
278
+ rescue StandardError
279
+ nil
280
+ end
281
+
282
+ # Public: Return true if the Jekyll default excerpt should be used for
283
+ # this file
284
+ #
285
+ # file - The Jekyll file
286
+ #
287
+ # Most of the time, we'll use our own excerpt (the first matching
288
+ # element), but in some cases, we'll fallback to Jekyll's default excerpt
289
+ # if it seems to be what the user wants
290
+ def self.use_default_excerpt?(file)
291
+ # Only posts can have excerpt
292
+ return false unless type(file) == 'post'
293
+
294
+ # User defined their own separator in the config
295
+ custom_separator = file.excerpt_separator.to_s.strip
296
+ return false if custom_separator.empty?
297
+
298
+ # This specific post contains this separator
299
+ file.content.include?(custom_separator)
300
+ end
301
+
302
+ # Public: Returns the HTML version of the excerpt
303
+ #
304
+ # file - The Jekyll file
218
305
  def self.excerpt_html(file)
219
- excerpt = file.data['excerpt']
220
- return nil if excerpt.nil?
221
- excerpt.to_s.tr("\n", ' ').strip
306
+ # If it's a post with a custom separator for the excerpt, we honor it
307
+ return excerpt_raw(file) if use_default_excerpt?(file)
308
+
309
+ # Otherwise we take the first matching node
310
+ html = file.content
311
+ selector = Configurator.algolia('nodes_to_index')
312
+ first_node = Nokogiri::HTML(html).css(selector).first
313
+ return nil if first_node.nil?
314
+
315
+ first_node.to_s
222
316
  end
223
317
 
224
318
  # Public: Returns the text version of the excerpt
@@ -228,7 +322,6 @@ module Jekyll
228
322
  # Only collections (including posts) have an excerpt. Pages don't.
229
323
  def self.excerpt_text(file)
230
324
  html = excerpt_html(file)
231
- return nil if html.nil?
232
325
  Utils.html_to_text(html)
233
326
  end
234
327
 
@@ -11,8 +11,15 @@ module Jekyll
11
11
  #
12
12
  # record - The hash of the record to be pushed
13
13
  # node - The Nokogiri node of the element
14
- def self.apply_each(record, node)
15
- before_indexing_each(record, node)
14
+ def self.apply_each(record, node, context)
15
+ case method(:before_indexing_each).arity
16
+ when 1
17
+ before_indexing_each(record)
18
+ when 2
19
+ before_indexing_each(record, node)
20
+ else
21
+ before_indexing_each(record, node, context)
22
+ end
16
23
  end
17
24
 
18
25
  # Public: Apply the before_indexing_all hook to all records.
@@ -21,8 +28,13 @@ module Jekyll
21
28
  # as they can be mocked in tests.
22
29
  #
23
30
  # records - The list of all records to be indexed
24
- def self.apply_all(records)
25
- before_indexing_all(records)
31
+ def self.apply_all(records, context)
32
+ case method(:before_indexing_all).arity
33
+ when 1
34
+ before_indexing_all(records)
35
+ else
36
+ before_indexing_all(records, context)
37
+ end
26
38
  end
27
39
 
28
40
  # Public: Check if the file should be indexed or not
@@ -47,7 +59,7 @@ module Jekyll
47
59
  # information from the HTML node.
48
60
  #
49
61
  # Users can return nil to signal that the record should not be indexed
50
- def self.before_indexing_each(record, _node)
62
+ def self.before_indexing_each(record, _node, _context)
51
63
  record
52
64
  end
53
65
 
@@ -59,7 +71,7 @@ module Jekyll
59
71
  # Users can modify the full list from here. It might provide an easier
60
72
  # interface than `hook_before_indexing_each` when knowing the full context
61
73
  # is necessary
62
- def self.before_indexing_all(records)
74
+ def self.before_indexing_all(records, _context)
63
75
  records
64
76
  end
65
77
  end
@@ -1,7 +1,10 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'algoliasearch'
4
+ require 'yaml'
5
+ require 'algolia_html_extractor'
4
6
 
7
+ # rubocop:disable Metrics/ModuleLength
5
8
  module Jekyll
6
9
  module Algolia
7
10
  # Module to push records to Algolia and configure the index
@@ -9,16 +12,60 @@ module Jekyll
9
12
  include Jekyll::Algolia
10
13
 
11
14
  # Public: Init the module
12
- #
13
- # This call will instanciate the Algolia API client, set the custom
14
- # User Agent and give an easy access to the main index
15
15
  def self.init
16
16
  ::Algolia.init(
17
17
  application_id: Configurator.application_id,
18
18
  api_key: Configurator.api_key
19
19
  )
20
+ index_name = Configurator.index_name
21
+ @index = ::Algolia::Index.new(index_name)
22
+ index_object_ids_name = Configurator.index_object_ids_name
23
+ @index_object_ids = ::Algolia::Index.new(index_object_ids_name)
20
24
 
21
25
  set_user_agent
26
+
27
+ self
28
+ end
29
+
30
+ # Public: Returns the Algolia index object
31
+ def self.index
32
+ @index
33
+ end
34
+
35
+ # Public: Returns the Algolia index used to store object ids
36
+ def self.index_object_ids
37
+ @index_object_ids
38
+ end
39
+
40
+ # Public: Check if an index exists
41
+ #
42
+ # index - Index to check
43
+ #
44
+ # Note: there is no API endpoint to do that, so we try to get the settings
45
+ # instead, which will fail if the index does not exist
46
+ def self.index_exist?(index)
47
+ index.get_settings
48
+ true
49
+ rescue StandardError
50
+ false
51
+ end
52
+
53
+ # Public: Get the number of records in an index
54
+ #
55
+ # index - Index to check
56
+ #
57
+ # Note: We'll do an empty query search, to match everything, but we'll
58
+ # only return the objectID and one element, to get the shortest response
59
+ # possible. It will still contain the nbHits
60
+ def self.record_count(index)
61
+ index.search(
62
+ '',
63
+ attributesToRetrieve: 'objectID',
64
+ distinct: false,
65
+ hitsPerPage: 1
66
+ )['nbHits']
67
+ rescue StandardError
68
+ 0
22
69
  end
23
70
 
24
71
  # Public: Set the User-Agent to send to the API
@@ -38,74 +85,75 @@ module Jekyll
38
85
  ::Algolia.set_extra_header('User-Agent', user_agent)
39
86
  end
40
87
 
41
- # Public: Returns an Algolia Index object from an index name
88
+ # Public: Get an array of all object IDs stored in the main index
42
89
  #
43
- # index_name - String name of the index
44
- def self.index(index_name)
45
- ::Algolia::Index.new(index_name)
46
- end
90
+ # Note: As this will be slow (grabbing them 1000 at a time), we display
91
+ # a progress bar.
92
+ def self.remote_object_ids_from_main_index
93
+ Logger.verbose("I:Inspecting existing records in index #{index.name}")
47
94
 
48
- # Public: Update records of the specified index
49
- #
50
- # index - Algolia Index to update
51
- # records - Array of records to update
52
- #
53
- # New records will be automatically added. Technically existing records
54
- # should be updated but this case should never happen as changing a record
55
- # content will change its objectID as well.
56
- #
57
- # Does nothing in dry run mode
58
- def self.update_records(index, records)
59
- batch_size = Configurator.algolia('indexing_batch_size')
60
- records.each_slice(batch_size) do |batch|
61
- Logger.log("I:Pushing #{batch.size} records")
62
- next if Configurator.dry_run?
63
- begin
64
- index.add_objects!(batch)
65
- rescue StandardError => error
66
- ErrorHandler.stop(error, records: records)
67
- end
68
- end
69
- end
70
-
71
- # Public: Delete records whose objectIDs are passed
72
- #
73
- # index - Algolia Index to target
74
- # ids - Array of objectIDs to delete
75
- #
76
- # Does nothing in dry run mode
77
- def self.delete_records_by_id(index, ids)
78
- return if ids.empty?
79
- Logger.log("I:Deleting #{ids.length} records")
80
- return if Configurator.dry_run?
95
+ list = []
81
96
 
97
+ # As it might take some time, we display a progress bar
98
+ progress_bar = ProgressBar.create(
99
+ total: record_count(index),
100
+ format: 'Inspecting existing records (%j%%) |%B|'
101
+ )
82
102
  begin
83
- index.delete_objects!(ids)
84
- rescue StandardError => error
85
- ErrorHandler.stop(error)
103
+ index.browse(
104
+ attributesToRetrieve: 'objectID',
105
+ hitsPerPage: 1000
106
+ ) do |hit|
107
+ list << hit['objectID']
108
+ progress_bar.increment
109
+ end
110
+ rescue StandardError
111
+ return []
86
112
  end
113
+
114
+ list.sort
87
115
  end
88
116
 
89
- # Public: Returns an array of all the objectIDs in the index
90
- #
91
- # index - Algolia Index to target
117
+ # Public: Get an array of all the object ids, stored in a dedicated
118
+ # index
92
119
  #
93
- # The returned array is sorted. It won't have any impact on the way it is
94
- # processed, but makes debugging easier when comparing arrays is needed.
95
- def self.remote_object_ids(index)
120
+ # Note: This will be very fast. Each record contains 100 object ids, so it
121
+ # will fit in one call each time.
122
+ def self.remote_object_ids_from_dedicated_index
96
123
  list = []
97
124
  begin
98
- index.browse(attributesToRetrieve: 'objectID') do |hit|
99
- list << hit['objectID']
125
+ index_object_ids.browse(
126
+ attributesToRetrieve: 'content',
127
+ hitsPerPage: 1000
128
+ ) do |hit|
129
+ list += hit['content']
100
130
  end
101
131
  rescue StandardError
102
- # The index might not exist if it's the first time we use the plugin
103
- # so we'll consider that it means there are no records there
104
132
  return []
105
133
  end
134
+
106
135
  list.sort
107
136
  end
108
137
 
138
+ # Public: Returns an array of all the objectIDs in the index
139
+ #
140
+ # Note: We use a dedicated index to store the objectIDs for faster
141
+ # browsing, but if the index does not exist we read the main index.
142
+ def self.remote_object_ids
143
+ Logger.log('I:Getting list of existing records')
144
+
145
+ # Main index empty, the list is empty no matter what (we don't use the
146
+ # dedicated index in that case)
147
+ return [] if record_count(index).zero?
148
+
149
+ # Fast version, using the dedicated index
150
+ has_object_id_index = index_exist?(index_object_ids)
151
+ return remote_object_ids_from_dedicated_index if has_object_id_index
152
+
153
+ # Slow version, browsing the full index
154
+ remote_object_ids_from_main_index
155
+ end
156
+
109
157
  # Public: Returns an array of the local objectIDs
110
158
  #
111
159
  # records - Array of all local records
@@ -113,116 +161,211 @@ module Jekyll
113
161
  records.map { |record| record[:objectID] }.compact.sort
114
162
  end
115
163
 
116
- # Public: Update settings of the index
164
+ # Public: Update records of the index
117
165
  #
118
- # index - The Algolia Index
119
- # settings - The hash of settings to pass to the index
166
+ # records - All records extracted from Jekyll
120
167
  #
168
+ # Note: All operations will be done in one batch, assuring an atomic
169
+ # update
121
170
  # Does nothing in dry run mode
122
- def self.update_settings(index, settings)
123
- Logger.verbose('I:Updating settings')
124
- return if Configurator.dry_run?
125
- begin
126
- index.set_settings(settings)
127
- rescue StandardError => error
128
- ErrorHandler.stop(error, settings: settings)
171
+ def self.update_records(records)
172
+ # Getting list of objectID in remote and locally
173
+ remote_ids = remote_object_ids
174
+ local_ids = local_object_ids(records)
175
+
176
+ # Making a diff, to see what to add and what to delete
177
+ ids_to_delete = remote_ids - local_ids
178
+ ids_to_add = local_ids - remote_ids
179
+
180
+ # What changes should we do to the indexes?
181
+ has_records_to_update = !ids_to_delete.empty? || !ids_to_add.empty?
182
+ has_object_id_index = index_exist?(index_object_ids)
183
+
184
+ # Stop if nothing to change
185
+ if !has_records_to_update && has_object_id_index
186
+ Logger.log('I:Content is already up to date.')
187
+ return
188
+ end
189
+
190
+ # We group all operations into one batch
191
+ operations = []
192
+
193
+ # We update records only if there are records to update
194
+ if has_records_to_update
195
+ Logger.log("I:Updating records in index #{index.name}...")
196
+ Logger.log("I:Records to delete: #{ids_to_delete.length}")
197
+ Logger.log("I:Records to add: #{ids_to_add.length}")
198
+
199
+ # Transforming ids into real records to add
200
+ records_by_id = Hash[records.map { |r| [r[:objectID], r] }]
201
+ records_to_add = ids_to_add.map { |id| records_by_id[id] }
202
+
203
+ # Deletion operations come first, to avoid hitting an overquota too
204
+ # soon if it can be avoided
205
+ ids_to_delete.each do |object_id|
206
+ operations << {
207
+ action: 'deleteObject', indexName: index.name,
208
+ body: { objectID: object_id }
209
+ }
210
+ end
211
+ # Then we add the new records
212
+ operations += records_to_add.map do |new_record|
213
+ { action: 'addObject', indexName: index.name, body: new_record }
214
+ end
215
+ end
216
+
217
+ # We update the dedicated index every time we update records, but we also
218
+ # create it if it does not exist
219
+ should_update_object_id_index = has_records_to_update ||
220
+ !has_object_id_index
221
+ if should_update_object_id_index
222
+ operations << { action: 'clear', indexName: index_object_ids.name }
223
+ local_ids.each_slice(100).each do |ids|
224
+ operations << {
225
+ action: 'addObject', indexName: index_object_ids.name,
226
+ body: { content: ids }
227
+ }
228
+ end
129
229
  end
230
+
231
+ execute_operations(operations)
130
232
  end
131
233
 
132
- # Public: Index content following the `diff` indexing mode
234
+ # Public: Execute a series of operations in a batch
133
235
  #
134
- # records - Array of local records
236
+ # operations - Operations to batch
135
237
  #
136
- # The `diff` indexing mode will only push new content to the index and
137
- # remove old content from it. It won't touch records that haven't been
138
- # updated. It will be a bit slower as it will first need to get the list
139
- # of all records in the index, but it will consume less operations.
140
- def self.run_diff_mode(records)
141
- index = index(Configurator.index_name)
142
-
143
- # Update settings
144
- update_settings(index, Configurator.settings)
238
+ # Note: Will split the batch in several calls if too big, and will display
239
+ # a progress bar if this happens
240
+ def self.execute_operations(operations)
241
+ return if Configurator.dry_run?
242
+ return if operations.empty?
145
243
 
146
- # Getting list of objectID in remote and locally
147
- remote_ids = remote_object_ids(index)
148
- local_ids = local_object_ids(records)
244
+ # Run the batches in slices if they are too large
245
+ batch_size = Configurator.algolia('indexing_batch_size')
246
+ slices = operations.each_slice(batch_size).to_a
149
247
 
150
- old_records_ids = remote_ids - local_ids
151
- new_records_ids = local_ids - remote_ids
152
- if old_records_ids.empty? && new_records_ids.empty?
153
- Logger.log('I:Nothing to index. Your content is already up to date.')
154
- return
248
+ should_have_progress_bar = (slices.length > 1)
249
+ if should_have_progress_bar
250
+ progress_bar = ProgressBar.create(
251
+ total: slices.length,
252
+ format: 'Updating index (%j%%) |%B|'
253
+ )
155
254
  end
156
255
 
157
- Logger.log('I:Pushing records to Algolia...')
158
-
159
- # Delete remote records that are no longer available locally
160
- delete_records_by_id(index, old_records_ids)
256
+ slices.each do |slice|
257
+ begin
258
+ ::Algolia.batch!(slice)
161
259
 
162
- # Add only records that are not yet already in the remote
163
- new_records = records.select do |record|
164
- new_records_ids.include?(record[:objectID])
260
+ progress_bar.increment if should_have_progress_bar
261
+ rescue StandardError => e
262
+ ErrorHandler.stop(e, operations: slice)
263
+ end
165
264
  end
166
- update_records(index, new_records)
265
+ end
167
266
 
168
- Logger.log('I:✔ Indexing complete')
267
+ # Public: Get a unique settingID for the current settings
268
+ #
269
+ # The settingID is generated as a hash of the current settings. As it will
270
+ # be stored in the userData key of the resulting config, we exclude that
271
+ # key from the hashing.
272
+ def self.local_setting_id
273
+ settings = Configurator.settings
274
+ settings.delete('userData')
275
+ AlgoliaHTMLExtractor.uuid(settings)
169
276
  end
170
277
 
171
278
  # Public: Get the settings of the remote index
172
279
  #
173
- # index - The Algolia Index
174
- def self.remote_settings(index)
280
+ # In case the index is not accessible, it will return nil
281
+ def self.remote_settings
175
282
  index.get_settings
176
- rescue StandardError => error
177
- ErrorHandler.stop(error)
283
+ rescue StandardError
284
+ nil
178
285
  end
179
286
 
180
- # Public: Rename an index
287
+ # Public: Smart update of the settings of the index
181
288
  #
182
- # old_name - Current name of the index
183
- # new_name - New name of the index
289
+ # This will first compare the settings about to be pushed with the
290
+ # settings already pushed. It will compare userData.settingID for that.
291
+ # If the settingID is the same, we don't push as this won't change
292
+ # anything. We will still check if the remote config seems to have been
293
+ # manually altered though, and warn the user that this is not the
294
+ # preferred way of doing so.
184
295
  #
185
- # Does nothing in dry run mode
186
- def self.rename_index(old_name, new_name)
187
- Logger.verbose("I:Renaming `#{old_name}` to `#{new_name}`")
188
- return if Configurator.dry_run?
189
- begin
190
- ::Algolia.move_index(old_name, new_name)
191
- rescue StandardError => error
192
- ErrorHandler.stop(error, new_name: new_name)
193
- end
194
- end
296
+ # If the settingIDs do not match, it means our config is different, so
297
+ # we push it, overriding the settingID for next push.
298
+ def self.update_settings
299
+ return if Configurator.settings.empty?
300
+
301
+ current_remote_settings = remote_settings || {}
302
+ remote_setting_id = current_remote_settings.dig('userData', 'settingID')
303
+
304
+ settings = Configurator.settings
305
+ setting_id = local_setting_id
306
+
307
+ are_settings_forced = Configurator.force_settings?
308
+
309
+ # The config we're about to push is the same we pushed previously. We
310
+ # won't push again.
311
+ if setting_id == remote_setting_id && !are_settings_forced
312
+ Logger.log('I:Settings are already up to date.')
313
+ # Check if remote config has been changed outside of the plugin, so we
314
+ # can warn users that they should not alter their config from outside
315
+ # of the plugin.
316
+ current_remote_settings.delete('userData')
317
+ changed_keys = Utils.diff_keys(settings, current_remote_settings)
318
+ unless changed_keys.nil?
319
+ warn_of_manual_dashboard_editing(changed_keys)
320
+ end
195
321
 
196
- # Public: Index content following the `atomic` indexing mode
197
- #
198
- # records - Array of records to push
199
- #
200
- # The `atomic` indexing mode will push all records to a brand new index,
201
- # configure it, and then overwrite the previous index with this new one.
202
- # For the end-user, it will make all the changes in one go, making sure
203
- # people are always searching into a fully configured index. It will
204
- # consume more operations, but will never leave the index in a transient
205
- # state.
206
- def self.run_atomic_mode(records)
207
- index_name = Configurator.index_name
208
- index = index(index_name)
209
- index_tmp_name = "#{Configurator.index_name}_tmp"
210
- index_tmp = index(index_tmp_name)
322
+ return
323
+ end
211
324
 
212
- Logger.verbose("I:Using `#{index_tmp_name}` as temporary index")
325
+ # Settings have changed, we push them
326
+ settings['userData'] = {
327
+ 'settingID' => setting_id,
328
+ 'pluginVersion' => VERSION
329
+ }
213
330
 
214
- # Copying original settings to the new index
215
- remote_settings = remote_settings(index)
216
- new_settings = remote_settings.merge(Configurator.settings)
217
- update_settings(index_tmp, new_settings)
331
+ Logger.log("I:Updating settings of index #{index.name}")
332
+ return if Configurator.dry_run?
218
333
 
219
- # Pushing everthing to a brand new index
220
- update_records(index_tmp, records)
334
+ set_settings(settings)
335
+ end
221
336
 
222
- # Renaming the new index in place of the old
223
- rename_index(index_tmp_name, index_name)
337
+ # Public: Set new settings to an index
338
+ #
339
+ # Will dispatch to the error handler if it fails
340
+ # rubocop:disable Naming/AccessorMethodName
341
+ def self.set_settings(settings)
342
+ index.set_settings!(settings)
343
+ rescue StandardError => e
344
+ ErrorHandler.stop(e, settings: settings)
345
+ end
346
+ # rubocop:enable Naming/AccessorMethodName
224
347
 
225
- Logger.log('I:✔ Indexing complete')
348
+ # Public: Warn users that they have some settings manually configured in
349
+ # their dashboard
350
+ #
351
+ # When users change some settings in their dashboard, those settings might
352
+ # get overwritten by the plugin. We can't prevent that, but we can warn
353
+ # them when we detect they changed something.
354
+ def self.warn_of_manual_dashboard_editing(changed_keys)
355
+ # Transform the hash into readable YAML
356
+ yaml_lines = changed_keys
357
+ .to_yaml(indentation: 2)
358
+ .split("\n")[1..-1]
359
+ yaml_lines.map! do |line|
360
+ line = line.gsub(/^ */) { |spaces| ' ' * spaces.length }
361
+ line = line.gsub('- ', '  - ')
362
+ "W:    #{line}"
363
+ end
364
+ Logger.known_message(
365
+ 'settings_manually_edited',
366
+ settings: yaml_lines.join("\n"),
367
+ index_name: Configurator.index_name
368
+ )
226
369
  end
227
370
 
228
371
  # Public: Push all records to Algolia and configure the index
@@ -231,10 +374,8 @@ module Jekyll
231
374
  def self.run(records)
232
375
  init
233
376
 
234
- record_count = records.length
235
-
236
377
  # Indexing zero record is surely a misconfiguration
237
- if record_count.zero?
378
+ if records.length.zero?
238
379
  files_to_exclude = Configurator.algolia('files_to_exclude').join(', ')
239
380
  Logger.known_message(
240
381
  'no_records_found',
@@ -244,15 +385,12 @@ module Jekyll
244
385
  exit 1
245
386
  end
246
387
 
247
- indexing_mode = Configurator.indexing_mode
248
- Logger.verbose("I:Indexing mode: #{indexing_mode}")
249
- case indexing_mode
250
- when 'diff'
251
- run_diff_mode(records)
252
- when 'atomic'
253
- run_atomic_mode(records)
254
- end
388
+ update_settings
389
+ update_records(records)
390
+
391
+ Logger.log('I:✔ Indexing complete')
255
392
  end
256
393
  end
257
394
  end
258
395
  end
396
+ # rubocop:enable Metrics/ModuleLength