jekyll-algolia 1.0.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +4 -4
  2. data/CONTRIBUTING.md +51 -30
  3. data/README.md +69 -27
  4. data/lib/errors/invalid_credentials.txt +12 -0
  5. data/lib/errors/invalid_index_name.txt +9 -0
  6. data/lib/errors/missing_api_key.txt +15 -0
  7. data/lib/errors/missing_application_id.txt +11 -0
  8. data/lib/errors/missing_index_name.txt +18 -0
  9. data/lib/errors/no_records_found.txt +14 -0
  10. data/lib/errors/record_too_big.txt +27 -0
  11. data/lib/errors/record_too_big_api.txt +10 -0
  12. data/lib/errors/settings_manually_edited.txt +17 -0
  13. data/lib/errors/too_many_records.txt +14 -0
  14. data/lib/errors/unknown_application_id.txt +16 -0
  15. data/lib/errors/unknown_settings.txt +12 -0
  16. data/lib/jekyll-algolia.rb +45 -60
  17. data/lib/jekyll/algolia/configurator.rb +137 -44
  18. data/lib/jekyll/algolia/error_handler.rb +36 -48
  19. data/lib/jekyll/algolia/extractor.rb +16 -6
  20. data/lib/jekyll/algolia/file_browser.rb +161 -68
  21. data/lib/jekyll/algolia/hooks.rb +18 -6
  22. data/lib/jekyll/algolia/indexer.rb +283 -145
  23. data/lib/jekyll/algolia/logger.rb +39 -8
  24. data/lib/jekyll/algolia/overwrites/githubpages-configuration.rb +32 -0
  25. data/lib/jekyll/algolia/overwrites/jekyll-algolia-site.rb +151 -0
  26. data/lib/jekyll/algolia/overwrites/jekyll-document.rb +13 -0
  27. data/lib/jekyll/algolia/overwrites/jekyll-paginate-pager.rb +20 -0
  28. data/lib/jekyll/algolia/overwrites/jekyll-tags-link.rb +33 -0
  29. data/lib/jekyll/algolia/progress_bar.rb +27 -0
  30. data/lib/jekyll/algolia/shrinker.rb +112 -0
  31. data/lib/jekyll/algolia/utils.rb +118 -2
  32. data/lib/jekyll/algolia/version.rb +1 -1
  33. data/lib/jekyll/commands/algolia.rb +3 -14
  34. metadata +75 -31
  35. data/errors/invalid_credentials.txt +0 -10
  36. data/errors/invalid_credentials_for_tmp_index.txt +0 -17
  37. data/errors/invalid_index_name.txt +0 -11
  38. data/errors/missing_api_key.txt +0 -17
  39. data/errors/missing_application_id.txt +0 -12
  40. data/errors/missing_index_name.txt +0 -19
  41. data/errors/no_records_found.txt +0 -20
  42. data/errors/record_too_big.txt +0 -25
  43. data/errors/unknown_application_id.txt +0 -20
  44. data/errors/unknown_settings.txt +0 -15
@@ -11,13 +11,15 @@ module Jekyll
11
11
  # Public: Extract records from the file
12
12
  #
13
13
  # file - The Jekyll file to process
14
- # TOTEST
15
14
  def self.run(file)
16
- # Getting all hierarchical nodes from the HTML input
15
+ # Getting all nodes from the HTML input
17
16
  raw_records = extract_raw_records(file.content)
18
17
  # Getting file metadata
19
18
  shared_metadata = FileBrowser.metadata(file)
20
19
 
20
+ # If no content, we still index the metadata
21
+ raw_records = [shared_metadata] if raw_records.empty?
22
+
21
23
  # Building the list of records
22
24
  records = []
23
25
  raw_records.map do |record|
@@ -31,7 +33,7 @@ module Jekyll
31
33
  # Apply custom user-defined hooks
32
34
  # Users can return `nil` from the hook to signal we should not index
33
35
  # such a record
34
- record = Hooks.apply_each(record, node)
36
+ record = Hooks.apply_each(record, node, Jekyll::Algolia.site)
35
37
  next if record.nil?
36
38
 
37
39
  records << record
@@ -48,16 +50,24 @@ module Jekyll
48
50
  end
49
51
 
50
52
  # Public: Extract raw records from the file, including content for each
51
- # node to index and hierarchy
53
+ # node and its headings
52
54
  #
53
55
  # content - The HTML content to parse
54
56
  def self.extract_raw_records(content)
55
- AlgoliaHTMLExtractor.run(
57
+ records = AlgoliaHTMLExtractor.run(
56
58
  content,
57
59
  options: {
58
- css_selector: Configurator.algolia('nodes_to_index')
60
+ css_selector: Configurator.algolia('nodes_to_index'),
61
+ tags_to_exclude: 'script,style,iframe'
59
62
  }
60
63
  )
64
+ # We remove objectIDs, as they will be added at the very end, after all
65
+ # the hooks and shrinkage
66
+ records.each do |record|
67
+ record.delete(:objectID)
68
+ end
69
+
70
+ records
61
71
  end
62
72
  end
63
73
  end
@@ -1,6 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'algolia_html_extractor'
4
+ require 'pathname'
5
+ require 'time'
4
6
 
5
7
  module Jekyll
6
8
  module Algolia
@@ -13,6 +15,50 @@ module Jekyll
13
15
  module FileBrowser
14
16
  include Jekyll::Algolia
15
17
 
18
+ # Public: Return the absolute path of a Jekyll file
19
+ #
20
+ # filepath - The path of the Jekyll file to inspect
21
+ def self.absolute_path(filepath)
22
+ pathname = Pathname.new(filepath)
23
+ return pathname.cleanpath.to_s if pathname.absolute?
24
+
25
+ File.expand_path(File.join(Configurator.get('source'), filepath))
26
+ end
27
+
28
+ # Public: Return the path of a Jekyll file relative to the Jekyll source
29
+ #
30
+ # file - The Jekyll file to inspect
31
+ def self.relative_path(filepath)
32
+ pathname = Pathname.new(filepath)
33
+ config_source = Configurator.get('source') || ''
34
+ jekyll_source = Pathname.new(File.expand_path(config_source))
35
+
36
+ # Removing any starting ./
37
+ if pathname.relative?
38
+ fullpath = File.expand_path(File.join(jekyll_source, pathname))
39
+ return fullpath.gsub(%r{^#{jekyll_source}/}, '')
40
+ end
41
+
42
+ pathname.relative_path_from(jekyll_source).cleanpath.to_s
43
+ end
44
+
45
+ # Public: Check if the file should be indexed
46
+ #
47
+ # file - The Jekyll file
48
+ #
49
+ # There are many reasons a file should not be indexed. We need to exclude
50
+ # all the static assets, only keep the actual content.
51
+ def self.indexable?(file)
52
+ return false if static_file?(file)
53
+ return false if is_404?(file)
54
+ return false if redirect?(file)
55
+ return false unless allowed_extension?(file)
56
+ return false if excluded_from_config?(file)
57
+ return false if excluded_from_hook?(file)
58
+
59
+ true
60
+ end
61
+
16
62
  # Public: Check if the specified file is a static Jekyll asset
17
63
  #
18
64
  # file - The Jekyll file
@@ -30,20 +76,27 @@ module Jekyll
30
76
  # pages. We don't want to index those.
31
77
  # Source: https://help.github.com/articles/creating-a-custom-404-page-for-your-github-pages-site/
32
78
  #
33
- # rubocop:disable Naming/PredicateName
34
79
  def self.is_404?(file)
35
- File.basename(file.path, File.extname(file.path)) == '404'
80
+ ['404.md', '404.html'].include?(File.basename(file.path))
36
81
  end
37
- # rubocop:enable Naming/PredicateName
38
82
 
39
- # Public: Check if the page is a pagination page
83
+ # Public: Check if the file is a redirect page
40
84
  #
41
85
  # file - The Jekyll file
42
86
  #
43
- # `jekyll-paginate` automatically creates pages to paginate through posts.
44
- # We don't want to index those
45
- def self.pagination_page?(file)
46
- Utils.match?(file.path, %r{page([0-9]*)/index\.html$})
87
+ # Plugins like jekyll-redirect-from add dynamic pages that only contain
88
+ # an HTML meta refresh. We need to exclude those files from indexing.
89
+ # https://github.com/jekyll/jekyll-redirect-from
90
+ def self.redirect?(file)
91
+ # When using redirect_from, jekyll-redirect-from creates a page named
92
+ # `redirect.html`
93
+ return true if file.respond_to?(:name) && file.name == 'redirect.html'
94
+ # When using redirect_to, it sets the layout to `redirect`
95
+ if file.respond_to?(:data) && file.data['layout'] == 'redirect'
96
+ return true
97
+ end
98
+
99
+ false
47
100
  end
48
101
 
49
102
  # Public: Check if the file has one of the allowed extensions
@@ -55,36 +108,24 @@ module Jekyll
55
108
  # and raw HTML files but this list can be extended using the
56
109
  # `extensions_to_index` config option.
57
110
  def self.allowed_extension?(file)
58
- extensions = Configurator.algolia('extensions_to_index')
111
+ extensions = Configurator.extensions_to_index
59
112
  extname = File.extname(file.path)[1..-1]
60
113
  extensions.include?(extname)
61
114
  end
62
115
 
63
- # Public: Check if the file has been excluded by the user
64
- #
65
- # file - The Jekyll file
66
- #
67
- # Files can be excluded either by setting the `files_to_exclude` option,
68
- # or by defining a custom hook
69
- def self.excluded_by_user?(file)
70
- excluded_from_config?(file) || excluded_from_hook?(file)
71
- end
72
-
73
116
  # Public: Check if the file has been excluded by `files_to_exclude`
74
117
  #
75
118
  # file - The Jekyll file
76
119
  def self.excluded_from_config?(file)
77
120
  excluded_patterns = Configurator.algolia('files_to_exclude')
78
- excluded_files = []
121
+ jekyll_source = Configurator.get('source')
122
+ path = absolute_path(file.path)
79
123
 
80
- # Transform the glob patterns into a real list of files
81
- Dir.chdir(Configurator.get('source')) do
82
- excluded_patterns.each do |pattern|
83
- excluded_files += Dir.glob(pattern)
84
- end
124
+ excluded_patterns.each do |pattern|
125
+ pattern = File.expand_path(File.join(jekyll_source, pattern))
126
+ return true if File.fnmatch(pattern, path, File::FNM_PATHNAME)
85
127
  end
86
-
87
- excluded_files.include?(file.path)
128
+ false
88
129
  end
89
130
 
90
131
  # Public: Check if the file has been excluded by running a custom user
@@ -95,34 +136,6 @@ module Jekyll
95
136
  Hooks.should_be_excluded?(file.path)
96
137
  end
97
138
 
98
- # Public: Return the path to the original file, relative from the Jekyll
99
- # source
100
- #
101
- # file - The Jekyll file
102
- #
103
- # Pages have their .path property relative to the source, but collections
104
- # (including posts) have an absolute file path.
105
- def self.path_from_root(file)
106
- source = Configurator.get('source')
107
- file.path.gsub(%r{^#{source}/}, '')
108
- end
109
-
110
- # Public: Check if the file should be indexed
111
- #
112
- # file - The Jekyll file
113
- #
114
- # There are many reasons a file should not be indexed. We need to exclude
115
- # all the static assets, only keep the actual content.
116
- def self.indexable?(file)
117
- return false if static_file?(file)
118
- return false if is_404?(file)
119
- return false if pagination_page?(file)
120
- return false unless allowed_extension?(file)
121
- return false if excluded_by_user?(file)
122
-
123
- true
124
- end
125
-
126
139
  # Public: Return a hash of all the file metadata
127
140
  #
128
141
  # file - The Jekyll file
@@ -134,6 +147,8 @@ module Jekyll
134
147
  raw_data = raw_data(file)
135
148
  specific_data = {
136
149
  collection: collection(file),
150
+ tags: tags(file),
151
+ categories: categories(file),
137
152
  date: date(file),
138
153
  excerpt_html: excerpt_html(file),
139
154
  excerpt_text: excerpt_text(file),
@@ -164,10 +179,16 @@ module Jekyll
164
179
  data.each_key do |key|
165
180
  data.delete(key) if respond_to?(key)
166
181
  end
167
-
168
- # Also delete keys we manually handle
169
182
  data.delete('excerpt')
170
183
 
184
+ # Delete other keys added by Jekyll that are not in the front-matter and
185
+ # not needed for search
186
+ data.delete('draft')
187
+ data.delete('ext')
188
+
189
+ # Convert all values to a version that can be serialized to JSON
190
+ data = Utils.jsonify(data)
191
+
171
192
  # Convert all keys to symbols
172
193
  data = Utils.keys_to_symbols(data)
173
194
 
@@ -196,29 +217,102 @@ module Jekyll
196
217
  file.url
197
218
  end
198
219
 
220
+ # Public: Returns the list of tags of a file, defaults to an empty array
221
+ #
222
+ # file - The Jekyll file
223
+ def self.tags(file)
224
+ file.data['tags'] || []
225
+ end
226
+
227
+ # Public: Returns the list of categories of a file, defaults to an empty array
228
+ #
229
+ # file - The Jekyll file
230
+ def self.categories(file)
231
+ file.data['categories'] || []
232
+ end
233
+
199
234
  # Public: Returns a timestamp of the file date
200
235
  #
201
236
  # file - The Jekyll file
202
237
  #
203
- # All collections have a date, either taken from the filename, or the
204
- # `date` config set in the front-matter. Even if none is set, the current
205
- # date is taken by default.
238
+ # Posts have their date coming from the filepath, or the front-matter.
239
+ # Pages and other collection items can only have a date set in
240
+ # front-matter.
206
241
  def self.date(file)
207
- date = file.data['date']
242
+ # Collections get their date from .date, while pages read it from .data.
243
+ # Jekyll by default will set the date of collection to the current date,
244
+ # but we monkey-patched that so it returns nil for collection items
245
+ date = if file.respond_to?(:date)
246
+ file.date
247
+ else
248
+ file.data['date']
249
+ end
250
+
208
251
  return nil if date.nil?
209
252
 
210
- date.to_i
253
+ # If date is a string, we try to parse it
254
+ if date.is_a? String
255
+ begin
256
+ date = Time.parse(date)
257
+ rescue StandardError
258
+ return nil
259
+ end
260
+ end
261
+
262
+ date.to_time.to_i
211
263
  end
212
264
 
213
- # Public: Returns the HTML version of the excerpt
265
+ # Public: Returns the raw excerpt of a file, directly as returned by
266
+ # Jekyll. Swallow any error that could occur when reading.
214
267
  #
215
268
  # file - The Jekyll file
216
269
  #
217
- # Only collections (including posts) have an excerpt. Pages don't.
270
+ # This might throw an exception if the excerpt is invalid. We also
271
+ # silence all logger output as Jekyll is quite verbose and will display
272
+ # the potential Liquid error in the terminal, even if we catch the actual
273
+ # error.
274
+ def self.excerpt_raw(file)
275
+ Logger.silent do
276
+ return file.data['excerpt'].to_s.strip
277
+ end
278
+ rescue StandardError
279
+ nil
280
+ end
281
+
282
+ # Public: Return true if the Jekyll default excerpt should be used for
283
+ # this file
284
+ #
285
+ # file - The Jekyll file
286
+ #
287
+ # Most of the time, we'll use our own excerpt (the first matching
288
+ # element), but in some cases, we'll fallback to Jekyll's default excerpt
289
+ # if it seems to be what the user wants
290
+ def self.use_default_excerpt?(file)
291
+ # Only posts can have excerpt
292
+ return false unless type(file) == 'post'
293
+
294
+ # User defined their own separator in the config
295
+ custom_separator = file.excerpt_separator.to_s.strip
296
+ return false if custom_separator.empty?
297
+
298
+ # This specific post contains this separator
299
+ file.content.include?(custom_separator)
300
+ end
301
+
302
+ # Public: Returns the HTML version of the excerpt
303
+ #
304
+ # file - The Jekyll file
218
305
  def self.excerpt_html(file)
219
- excerpt = file.data['excerpt']
220
- return nil if excerpt.nil?
221
- excerpt.to_s.tr("\n", ' ').strip
306
+ # If it's a post with a custom separator for the excerpt, we honor it
307
+ return excerpt_raw(file) if use_default_excerpt?(file)
308
+
309
+ # Otherwise we take the first matching node
310
+ html = file.content
311
+ selector = Configurator.algolia('nodes_to_index')
312
+ first_node = Nokogiri::HTML(html).css(selector).first
313
+ return nil if first_node.nil?
314
+
315
+ first_node.to_s
222
316
  end
223
317
 
224
318
  # Public: Returns the text version of the excerpt
@@ -228,7 +322,6 @@ module Jekyll
228
322
  # Only collections (including posts) have an excerpt. Pages don't.
229
323
  def self.excerpt_text(file)
230
324
  html = excerpt_html(file)
231
- return nil if html.nil?
232
325
  Utils.html_to_text(html)
233
326
  end
234
327
 
@@ -11,8 +11,15 @@ module Jekyll
11
11
  #
12
12
  # record - The hash of the record to be pushed
13
13
  # node - The Nokogiri node of the element
14
- def self.apply_each(record, node)
15
- before_indexing_each(record, node)
14
+ def self.apply_each(record, node, context)
15
+ case method(:before_indexing_each).arity
16
+ when 1
17
+ before_indexing_each(record)
18
+ when 2
19
+ before_indexing_each(record, node)
20
+ else
21
+ before_indexing_each(record, node, context)
22
+ end
16
23
  end
17
24
 
18
25
  # Public: Apply the before_indexing_all hook to all records.
@@ -21,8 +28,13 @@ module Jekyll
21
28
  # as they can be mocked in tests.
22
29
  #
23
30
  # records - The list of all records to be indexed
24
- def self.apply_all(records)
25
- before_indexing_all(records)
31
+ def self.apply_all(records, context)
32
+ case method(:before_indexing_all).arity
33
+ when 1
34
+ before_indexing_all(records)
35
+ else
36
+ before_indexing_all(records, context)
37
+ end
26
38
  end
27
39
 
28
40
  # Public: Check if the file should be indexed or not
@@ -47,7 +59,7 @@ module Jekyll
47
59
  # information from the HTML node.
48
60
  #
49
61
  # Users can return nil to signal that the record should not be indexed
50
- def self.before_indexing_each(record, _node)
62
+ def self.before_indexing_each(record, _node, _context)
51
63
  record
52
64
  end
53
65
 
@@ -59,7 +71,7 @@ module Jekyll
59
71
  # Users can modify the full list from here. It might provide an easier
60
72
  # interface than `hook_before_indexing_each` when knowing the full context
61
73
  # is necessary
62
- def self.before_indexing_all(records)
74
+ def self.before_indexing_all(records, _context)
63
75
  records
64
76
  end
65
77
  end
@@ -1,7 +1,10 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'algoliasearch'
4
+ require 'yaml'
5
+ require 'algolia_html_extractor'
4
6
 
7
+ # rubocop:disable Metrics/ModuleLength
5
8
  module Jekyll
6
9
  module Algolia
7
10
  # Module to push records to Algolia and configure the index
@@ -9,16 +12,60 @@ module Jekyll
9
12
  include Jekyll::Algolia
10
13
 
11
14
  # Public: Init the module
12
- #
13
- # This call will instanciate the Algolia API client, set the custom
14
- # User Agent and give an easy access to the main index
15
15
  def self.init
16
16
  ::Algolia.init(
17
17
  application_id: Configurator.application_id,
18
18
  api_key: Configurator.api_key
19
19
  )
20
+ index_name = Configurator.index_name
21
+ @index = ::Algolia::Index.new(index_name)
22
+ index_object_ids_name = Configurator.index_object_ids_name
23
+ @index_object_ids = ::Algolia::Index.new(index_object_ids_name)
20
24
 
21
25
  set_user_agent
26
+
27
+ self
28
+ end
29
+
30
+ # Public: Returns the Algolia index object
31
+ def self.index
32
+ @index
33
+ end
34
+
35
+ # Public: Returns the Algolia index used to store object ids
36
+ def self.index_object_ids
37
+ @index_object_ids
38
+ end
39
+
40
+ # Public: Check if an index exists
41
+ #
42
+ # index - Index to check
43
+ #
44
+ # Note: there is no API endpoint to do that, so we try to get the settings
45
+ # instead, which will fail if the index does not exist
46
+ def self.index_exist?(index)
47
+ index.get_settings
48
+ true
49
+ rescue StandardError
50
+ false
51
+ end
52
+
53
+ # Public: Get the number of records in an index
54
+ #
55
+ # index - Index to check
56
+ #
57
+ # Note: We'll do an empty query search, to match everything, but we'll
58
+ # only return the objectID and one element, to get the shortest response
59
+ # possible. It will still contain the nbHits
60
+ def self.record_count(index)
61
+ index.search(
62
+ '',
63
+ attributesToRetrieve: 'objectID',
64
+ distinct: false,
65
+ hitsPerPage: 1
66
+ )['nbHits']
67
+ rescue StandardError
68
+ 0
22
69
  end
23
70
 
24
71
  # Public: Set the User-Agent to send to the API
@@ -38,74 +85,75 @@ module Jekyll
38
85
  ::Algolia.set_extra_header('User-Agent', user_agent)
39
86
  end
40
87
 
41
- # Public: Returns an Algolia Index object from an index name
88
+ # Public: Get an array of all object IDs stored in the main index
42
89
  #
43
- # index_name - String name of the index
44
- def self.index(index_name)
45
- ::Algolia::Index.new(index_name)
46
- end
90
+ # Note: As this will be slow (grabbing them 1000 at a time), we display
91
+ # a progress bar.
92
+ def self.remote_object_ids_from_main_index
93
+ Logger.verbose("I:Inspecting existing records in index #{index.name}")
47
94
 
48
- # Public: Update records of the specified index
49
- #
50
- # index - Algolia Index to update
51
- # records - Array of records to update
52
- #
53
- # New records will be automatically added. Technically existing records
54
- # should be updated but this case should never happen as changing a record
55
- # content will change its objectID as well.
56
- #
57
- # Does nothing in dry run mode
58
- def self.update_records(index, records)
59
- batch_size = Configurator.algolia('indexing_batch_size')
60
- records.each_slice(batch_size) do |batch|
61
- Logger.log("I:Pushing #{batch.size} records")
62
- next if Configurator.dry_run?
63
- begin
64
- index.add_objects!(batch)
65
- rescue StandardError => error
66
- ErrorHandler.stop(error, records: records)
67
- end
68
- end
69
- end
70
-
71
- # Public: Delete records whose objectIDs are passed
72
- #
73
- # index - Algolia Index to target
74
- # ids - Array of objectIDs to delete
75
- #
76
- # Does nothing in dry run mode
77
- def self.delete_records_by_id(index, ids)
78
- return if ids.empty?
79
- Logger.log("I:Deleting #{ids.length} records")
80
- return if Configurator.dry_run?
95
+ list = []
81
96
 
97
+ # As it might take some time, we display a progress bar
98
+ progress_bar = ProgressBar.create(
99
+ total: record_count(index),
100
+ format: 'Inspecting existing records (%j%%) |%B|'
101
+ )
82
102
  begin
83
- index.delete_objects!(ids)
84
- rescue StandardError => error
85
- ErrorHandler.stop(error)
103
+ index.browse(
104
+ attributesToRetrieve: 'objectID',
105
+ hitsPerPage: 1000
106
+ ) do |hit|
107
+ list << hit['objectID']
108
+ progress_bar.increment
109
+ end
110
+ rescue StandardError
111
+ return []
86
112
  end
113
+
114
+ list.sort
87
115
  end
88
116
 
89
- # Public: Returns an array of all the objectIDs in the index
90
- #
91
- # index - Algolia Index to target
117
+ # Public: Get an array of all the object ids, stored in a dedicated
118
+ # index
92
119
  #
93
- # The returned array is sorted. It won't have any impact on the way it is
94
- # processed, but makes debugging easier when comparing arrays is needed.
95
- def self.remote_object_ids(index)
120
+ # Note: This will be very fast. Each record contains up to 100 object ids, so it
121
+ # will fit in one call each time.
122
+ def self.remote_object_ids_from_dedicated_index
96
123
  list = []
97
124
  begin
98
- index.browse(attributesToRetrieve: 'objectID') do |hit|
99
- list << hit['objectID']
125
+ index_object_ids.browse(
126
+ attributesToRetrieve: 'content',
127
+ hitsPerPage: 1000
128
+ ) do |hit|
129
+ list += hit['content']
100
130
  end
101
131
  rescue StandardError
102
- # The index might not exist if it's the first time we use the plugin
103
- # so we'll consider that it means there are no records there
104
132
  return []
105
133
  end
134
+
106
135
  list.sort
107
136
  end
108
137
 
138
+ # Public: Returns an array of all the objectIDs in the index
139
+ #
140
+ # Note: We use a dedicated index to store the objectIDs for faster
141
+ # browsing, but if the index does not exist we read the main index.
142
+ def self.remote_object_ids
143
+ Logger.log('I:Getting list of existing records')
144
+
145
+ # Main index empty, the list is empty no matter what (we don't use the
146
+ # dedicated index in that case)
147
+ return [] if record_count(index).zero?
148
+
149
+ # Fast version, using the dedicated index
150
+ has_object_id_index = index_exist?(index_object_ids)
151
+ return remote_object_ids_from_dedicated_index if has_object_id_index
152
+
153
+ # Slow version, browsing the full index
154
+ remote_object_ids_from_main_index
155
+ end
156
+
109
157
  # Public: Returns an array of the local objectIDs
110
158
  #
111
159
  # records - Array of all local records
@@ -113,116 +161,211 @@ module Jekyll
113
161
  records.map { |record| record[:objectID] }.compact.sort
114
162
  end
115
163
 
116
- # Public: Update settings of the index
164
+ # Public: Update records of the index
117
165
  #
118
- # index - The Algolia Index
119
- # settings - The hash of settings to pass to the index
166
+ # records - All records extracted from Jekyll
120
167
  #
168
+ # Note: All operations will be done in one batch, assuring an atomic
169
+ # update
121
170
  # Does nothing in dry run mode
122
- def self.update_settings(index, settings)
123
- Logger.verbose('I:Updating settings')
124
- return if Configurator.dry_run?
125
- begin
126
- index.set_settings(settings)
127
- rescue StandardError => error
128
- ErrorHandler.stop(error, settings: settings)
171
+ def self.update_records(records)
172
+ # Getting list of objectID in remote and locally
173
+ remote_ids = remote_object_ids
174
+ local_ids = local_object_ids(records)
175
+
176
+ # Making a diff, to see what to add and what to delete
177
+ ids_to_delete = remote_ids - local_ids
178
+ ids_to_add = local_ids - remote_ids
179
+
180
+ # What changes should we do to the indexes?
181
+ has_records_to_update = !ids_to_delete.empty? || !ids_to_add.empty?
182
+ has_object_id_index = index_exist?(index_object_ids)
183
+
184
+ # Stop if nothing to change
185
+ if !has_records_to_update && has_object_id_index
186
+ Logger.log('I:Content is already up to date.')
187
+ return
188
+ end
189
+
190
+ # We group all operations into one batch
191
+ operations = []
192
+
193
+ # We update records only if there are records to update
194
+ if has_records_to_update
195
+ Logger.log("I:Updating records in index #{index.name}...")
196
+ Logger.log("I:Records to delete: #{ids_to_delete.length}")
197
+ Logger.log("I:Records to add: #{ids_to_add.length}")
198
+
199
+ # Transforming ids into real records to add
200
+ records_by_id = Hash[records.map { |r| [r[:objectID], r] }]
201
+ records_to_add = ids_to_add.map { |id| records_by_id[id] }
202
+
203
+ # Deletion operations come first, to avoid hitting an overquota too
204
+ # soon if it can be avoided
205
+ ids_to_delete.each do |object_id|
206
+ operations << {
207
+ action: 'deleteObject', indexName: index.name,
208
+ body: { objectID: object_id }
209
+ }
210
+ end
211
+ # Then we add the new records
212
+ operations += records_to_add.map do |new_record|
213
+ { action: 'addObject', indexName: index.name, body: new_record }
214
+ end
215
+ end
216
+
217
+ # We update the dedicated index every time we update records, but we also
218
+ # create it if it does not exist
219
+ should_update_object_id_index = has_records_to_update ||
220
+ !has_object_id_index
221
+ if should_update_object_id_index
222
+ operations << { action: 'clear', indexName: index_object_ids.name }
223
+ local_ids.each_slice(100).each do |ids|
224
+ operations << {
225
+ action: 'addObject', indexName: index_object_ids.name,
226
+ body: { content: ids }
227
+ }
228
+ end
129
229
  end
230
+
231
+ execute_operations(operations)
130
232
  end
131
233
 
132
- # Public: Index content following the `diff` indexing mode
234
+ # Public: Execute a series of operations in a batch
133
235
  #
134
- # records - Array of local records
236
+ # operations - Operations to batch
135
237
  #
136
- # The `diff` indexing mode will only push new content to the index and
137
- # remove old content from it. It won't touch records that haven't been
138
- # updated. It will be a bit slower as it will first need to get the list
139
- # of all records in the index, but it will consume less operations.
140
- def self.run_diff_mode(records)
141
- index = index(Configurator.index_name)
142
-
143
- # Update settings
144
- update_settings(index, Configurator.settings)
238
+ # Note: Will split the batch in several calls if too big, and will display
239
+ # a progress bar if this happens
240
+ def self.execute_operations(operations)
241
+ return if Configurator.dry_run?
242
+ return if operations.empty?
145
243
 
146
- # Getting list of objectID in remote and locally
147
- remote_ids = remote_object_ids(index)
148
- local_ids = local_object_ids(records)
244
+ # Run the batches in slices if they are too large
245
+ batch_size = Configurator.algolia('indexing_batch_size')
246
+ slices = operations.each_slice(batch_size).to_a
149
247
 
150
- old_records_ids = remote_ids - local_ids
151
- new_records_ids = local_ids - remote_ids
152
- if old_records_ids.empty? && new_records_ids.empty?
153
- Logger.log('I:Nothing to index. Your content is already up to date.')
154
- return
248
+ should_have_progress_bar = (slices.length > 1)
249
+ if should_have_progress_bar
250
+ progress_bar = ProgressBar.create(
251
+ total: slices.length,
252
+ format: 'Updating index (%j%%) |%B|'
253
+ )
155
254
  end
156
255
 
157
- Logger.log('I:Pushing records to Algolia...')
158
-
159
- # Delete remote records that are no longer available locally
160
- delete_records_by_id(index, old_records_ids)
256
+ slices.each do |slice|
257
+ begin
258
+ ::Algolia.batch!(slice)
161
259
 
162
- # Add only records that are not yet already in the remote
163
- new_records = records.select do |record|
164
- new_records_ids.include?(record[:objectID])
260
+ progress_bar.increment if should_have_progress_bar
261
+ rescue StandardError => e
262
+ ErrorHandler.stop(e, operations: slice)
263
+ end
165
264
  end
166
- update_records(index, new_records)
265
+ end
167
266
 
168
- Logger.log('I:✔ Indexing complete')
267
+ # Public: Get a unique settingID for the current settings
268
+ #
269
+ # The settingID is generated as a hash of the current settings. As it will
270
+ # be stored in the userData key of the resulting config, we exclude that
271
+ # key from the hashing.
272
+ def self.local_setting_id
273
+ settings = Configurator.settings
274
+ settings.delete('userData')
275
+ AlgoliaHTMLExtractor.uuid(settings)
169
276
  end
170
277
 
171
278
  # Public: Get the settings of the remote index
172
279
  #
173
- # index - The Algolia Index
174
- def self.remote_settings(index)
280
+ # In case the index is not accessible, it will return nil
281
+ def self.remote_settings
175
282
  index.get_settings
176
- rescue StandardError => error
177
- ErrorHandler.stop(error)
283
+ rescue StandardError
284
+ nil
178
285
  end
179
286
 
180
- # Public: Rename an index
287
+ # Public: Smart update of the settings of the index
181
288
  #
182
- # old_name - Current name of the index
183
- # new_name - New name of the index
289
+ # This will first compare the settings about to be pushed with the
290
+ # settings already pushed. It will compare userData.settingID for that.
291
+ # If the settingID is the same, we don't push as this won't change
292
+ # anything. We will still check if the remote config seem to have been
293
+ # manually altered though, and warn the user that this is not the
294
+ # preferred way of doing so.
184
295
  #
185
- # Does nothing in dry run mode
186
- def self.rename_index(old_name, new_name)
187
- Logger.verbose("I:Renaming `#{old_name}` to `#{new_name}`")
188
- return if Configurator.dry_run?
189
- begin
190
- ::Algolia.move_index(old_name, new_name)
191
- rescue StandardError => error
192
- ErrorHandler.stop(error, new_name: new_name)
193
- end
194
- end
296
+ # If the settingID are not matching, it means our config is different, so
297
+ # we push it, overriding the settingID for next push.
298
+ def self.update_settings
299
+ return if Configurator.settings.empty?
300
+
301
+ current_remote_settings = remote_settings || {}
302
+ remote_setting_id = current_remote_settings.dig('userData', 'settingID')
303
+
304
+ settings = Configurator.settings
305
+ setting_id = local_setting_id
306
+
307
+ are_settings_forced = Configurator.force_settings?
308
+
309
+ # The config we're about to push is the same we pushed previously. We
310
+ # won't push again.
311
+ if setting_id == remote_setting_id && !are_settings_forced
312
+ Logger.log('I:Settings are already up to date.')
313
+ # Check if remote config has been changed outside of the plugin, so we
314
+ # can warn users that they should not alter their config from outside
315
+ # of the plugin.
316
+ current_remote_settings.delete('userData')
317
+ changed_keys = Utils.diff_keys(settings, current_remote_settings)
318
+ unless changed_keys.nil?
319
+ warn_of_manual_dashboard_editing(changed_keys)
320
+ end
195
321
 
196
- # Public: Index content following the `atomic` indexing mode
197
- #
198
- # records - Array of records to push
199
- #
200
- # The `atomic` indexing mode will push all records to a brand new index,
201
- # configure it, and then overwrite the previous index with this new one.
202
- # For the end-user, it will make all the changes in one go, making sure
203
- # people are always searching into a fully configured index. It will
204
- # consume more operations, but will never leave the index in a transient
205
- # state.
206
- def self.run_atomic_mode(records)
207
- index_name = Configurator.index_name
208
- index = index(index_name)
209
- index_tmp_name = "#{Configurator.index_name}_tmp"
210
- index_tmp = index(index_tmp_name)
322
+ return
323
+ end
211
324
 
212
- Logger.verbose("I:Using `#{index_tmp_name}` as temporary index")
325
+ # Settings have changed, we push them
326
+ settings['userData'] = {
327
+ 'settingID' => setting_id,
328
+ 'pluginVersion' => VERSION
329
+ }
213
330
 
214
- # Copying original settings to the new index
215
- remote_settings = remote_settings(index)
216
- new_settings = remote_settings.merge(Configurator.settings)
217
- update_settings(index_tmp, new_settings)
331
+ Logger.log("I:Updating settings of index #{index.name}")
332
+ return if Configurator.dry_run?
218
333
 
219
- # Pushing everthing to a brand new index
220
- update_records(index_tmp, records)
334
+ set_settings(settings)
335
+ end
221
336
 
222
- # Renaming the new index in place of the old
223
- rename_index(index_tmp_name, index_name)
337
+ # Public: Set new settings to an index
338
+ #
339
+ # Will dispatch to the error handler if it fails
340
+ # rubocop:disable Naming/AccessorMethodName
341
+ def self.set_settings(settings)
342
+ index.set_settings!(settings)
343
+ rescue StandardError => e
344
+ ErrorHandler.stop(e, settings: settings)
345
+ end
346
+ # rubocop:enable Naming/AccessorMethodName
224
347
 
225
- Logger.log('I:✔ Indexing complete')
348
+ # Public: Warn users that they have some settings manually configured in
349
+ # their dashboard
350
+ #
351
+ # When users change some settings in their dashboard, those settings might
352
+ # get overwritten by the plugin. We can't prevent that, but we can warn
353
+ # them when we detect they changed something.
354
+ def self.warn_of_manual_dashboard_editing(changed_keys)
355
+ # Transform the hash into readable YAML
356
+ yaml_lines = changed_keys
357
+ .to_yaml(indentation: 2)
358
+ .split("\n")[1..-1]
359
+ yaml_lines.map! do |line|
360
+ line = line.gsub(/^ */) { |spaces| ' ' * spaces.length }
361
+ line = line.gsub('- ', '  - ')
362
+ "W:    #{line}"
363
+ end
364
+ Logger.known_message(
365
+ 'settings_manually_edited',
366
+ settings: yaml_lines.join("\n"),
367
+ index_name: Configurator.index_name
368
+ )
226
369
  end
227
370
 
228
371
  # Public: Push all records to Algolia and configure the index
@@ -231,10 +374,8 @@ module Jekyll
231
374
  def self.run(records)
232
375
  init
233
376
 
234
- record_count = records.length
235
-
236
377
  # Indexing zero record is surely a misconfiguration
237
- if record_count.zero?
378
+ if records.length.zero?
238
379
  files_to_exclude = Configurator.algolia('files_to_exclude').join(', ')
239
380
  Logger.known_message(
240
381
  'no_records_found',
@@ -244,15 +385,12 @@ module Jekyll
244
385
  exit 1
245
386
  end
246
387
 
247
- indexing_mode = Configurator.indexing_mode
248
- Logger.verbose("I:Indexing mode: #{indexing_mode}")
249
- case indexing_mode
250
- when 'diff'
251
- run_diff_mode(records)
252
- when 'atomic'
253
- run_atomic_mode(records)
254
- end
388
+ update_settings
389
+ update_records(records)
390
+
391
+ Logger.log('I:✔ Indexing complete')
255
392
  end
256
393
  end
257
394
  end
258
395
  end
396
+ # rubocop:enable Metrics/ModuleLength