jekyll-algolia 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,64 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'algolia_html_extractor'
4
+
5
+ module Jekyll
6
+ module Algolia
7
+ # Module to extract records from Jekyll files
8
+ module Extractor
9
+ include Jekyll::Algolia
10
+
11
# Public: Extract records from the file
#
# file - The Jekyll file to process
#
# Every raw record returned by extract_raw_records has its :node entry
# removed, is merged with the metadata of the file, cleaned through
# Utils.compact_empty and finally handed to the user-defined hook together
# with the removed node.
#
# Returns an Array of records. Records for which the hook returned nil are
# left out.
def self.run(file)
  # Getting all hierarchical nodes from the HTML input
  raw_records = extract_raw_records(file.content)
  # Getting file metadata
  shared_metadata = FileBrowser.metadata(file)

  # Building the list of records
  raw_records.each_with_object([]) do |raw_record, records|
    # We do not need to pass the HTML node element to the final record
    node = raw_record.delete(:node)

    # Merging each record info with file info
    record = Utils.compact_empty(raw_record.merge(shared_metadata))

    # Apply custom user-defined hooks
    # Users can return `nil` from the hook to signal we should not index
    # such a record
    record = Hooks.apply_each(record, node)
    records << record unless record.nil?
  end
end
42
+
43
# Public: Adds a unique :objectID field to the hash, representing the
# current content of the record
#
# record - The Hash of the record. It is modified in place.
#
# Returns the same record, now holding the value computed by
# AlgoliaHTMLExtractor.uuid under the :objectID key
def self.add_unique_object_id(record)
  record.tap do |hash|
    hash[:objectID] = AlgoliaHTMLExtractor.uuid(hash)
  end
end
49
+
50
# Public: Extract raw records from the file, including content for each
# node to index and hierarchy
#
# content - The HTML content to parse
#
# The CSS selector of the nodes to extract is read from the
# `nodes_to_index` option.
#
# Returns whatever AlgoliaHTMLExtractor.run returns for this content
def self.extract_raw_records(content)
  selector = Configurator.algolia('nodes_to_index')
  extractor_options = { css_selector: selector }

  AlgoliaHTMLExtractor.run(content, options: extractor_options)
end
62
+ end
63
+ end
64
+ end
@@ -0,0 +1,269 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'algolia_html_extractor'
4
+
5
+ module Jekyll
6
+ module Algolia
7
+ # Module to get information about Jekyll file. Jekyll handles posts, pages,
8
+ # collection, etc. They each need specific processing, so knowing which kind
9
+ # of file we're working on will help.
10
+ #
11
+ # We also do not index all files. This module will help in defining which
12
+ # files should be indexed and which should not.
13
+ module FileBrowser
14
+ include Jekyll::Algolia
15
+
16
# Public: Check if the specified file is a static Jekyll asset
#
# file - The Jekyll file
#
# We don't index static assets (js, css, images)
#
# Returns true if the file is a Jekyll::StaticFile, false otherwise
def self.static_file?(file)
  static_class = Jekyll::StaticFile
  file.is_a?(static_class)
end
24
+
25
# Public: Check if the file is a 404 error page
#
# file - The Jekyll file
#
# 404 pages are not Jekyll defaults but a convention adopted by GitHub
# pages. We don't want to index those.
# Source: https://help.github.com/articles/creating-a-custom-404-page-for-your-github-pages-site/
#
# Returns true if the name of the file, without its extension, is `404`
#
# rubocop:disable Naming/PredicateName
def self.is_404?(file)
  path = file.path
  name_without_extension = File.basename(path, File.extname(path))
  name_without_extension == '404'
end
# rubocop:enable Naming/PredicateName
38
+
39
# Public: Check if the page is a pagination page
#
# file - The Jekyll file
#
# `jekyll-paginate` automatically creates pages to paginate through posts.
# We don't want to index those
#
# A path is considered a pagination page when it ends with
# `page<number>/index.html`, where `page<number>` is a whole directory
# name. The directory boundary and the mandatory number keep regular pages
# such as `homepage/index.html` or `landing-page/index.html` indexable.
#
# Returns the result of Utils.match? for the path against that pattern
def self.pagination_page?(file)
  Utils.match?(file.path, %r{(?:\A|/)page[0-9]+/index\.html\z})
end
48
+
49
# Public: Check if the file has one of the allowed extensions
#
# file - The Jekyll file
#
# Jekyll can transform markdown files to HTML by default. With plugins, it
# can convert many more file formats. The list of extensions to index is
# read from the `extensions_to_index` config option.
#
# Returns true if the extension of the file (without the leading dot) is
# part of that list
def self.allowed_extension?(file)
  # Drop the leading dot returned by File.extname
  extension = File.extname(file.path)[1..-1]

  Configurator.algolia('extensions_to_index').include?(extension)
end
62
+
63
# Public: Check if the file has been excluded by the user
#
# file - The Jekyll file
#
# Files can be excluded either by setting the `files_to_exclude` option,
# or by defining a custom hook. The hook is only consulted when the
# configuration did not already exclude the file.
#
# Returns a truthy value if the file is excluded by either mechanism
def self.excluded_by_user?(file)
  from_config = excluded_from_config?(file)
  return from_config if from_config

  excluded_from_hook?(file)
end
72
+
73
# Public: Check if the file has been excluded by `files_to_exclude`
#
# file - The Jekyll file
#
# The glob patterns of the `files_to_exclude` option are expanded from
# inside the Jekyll source directory, so the resulting list only contains
# paths relative to the source.
#
# Returns true if the path of the file, relative to the source, is part of
# the expanded list
def self.excluded_from_config?(file)
  excluded_patterns = Configurator.algolia('files_to_exclude')

  # Transform the glob patterns into a real list of files
  excluded_files = Dir.chdir(Configurator.get('source')) do
    excluded_patterns.flat_map { |pattern| Dir.glob(pattern) }
  end

  # Collection items (including posts) expose an absolute path, which would
  # never match the relative glob results, so the path is made relative to
  # the source before the comparison
  excluded_files.include?(path_from_root(file))
end
89
+
90
# Public: Check if the file has been excluded by running a custom user
# hook
#
# file - The Jekyll file
#
# Returns the value of Hooks.should_be_excluded? for the path of the file
def self.excluded_from_hook?(file)
  filepath = file.path
  Hooks.should_be_excluded?(filepath)
end
97
+
98
# Public: Return the path to the original file, relative from the Jekyll
# source
#
# file - The Jekyll file
#
# Pages have their .path property relative to the source, but collections
# (including posts) have an absolute file path.
#
# The source directory is compared as a literal prefix, so characters that
# are special in a regular expression (`.`, `+`, parentheses...) in the
# source path cannot alter the match.
#
# Returns the path without the leading source directory, or the path
# untouched when it does not start with the source directory
def self.path_from_root(file)
  # File.join with an empty last part yields the source followed by exactly
  # one trailing slash, whether or not the source already ends with one
  source_prefix = File.join(Configurator.get('source'), '')
  path = file.path
  return path unless path.start_with?(source_prefix)

  path[source_prefix.length..-1]
end
109
+
110
# Public: Check if the file should be indexed
#
# file - The Jekyll file
#
# There are many reasons a file should not be indexed. Static assets, 404
# pages, pagination pages, files without an allowed extension and files
# excluded by the user are all rejected. Checks run in that order and stop
# at the first one that rejects the file.
#
# Returns true if the file passed every check, false otherwise
def self.indexable?(file)
  rejected = static_file?(file) ||
             is_404?(file) ||
             pagination_page?(file) ||
             !allowed_extension?(file) ||
             excluded_by_user?(file)

  !rejected
end
125
+
126
# Public: Return a hash of all the file metadata
#
# file - The Jekyll file
#
# It contains both the raw metadata extracted from the front-matter, as
# well as more specific fields like the collection name, date timestamp,
# excerpts, slug, type and url. When a key exists on both sides, the
# specific field wins.
#
# Returns the merged Hash, after it went through Utils.compact_empty
def self.metadata(file)
  front_matter = raw_data(file)

  Utils.compact_empty(
    front_matter.merge(
      collection: collection(file),
      date: date(file),
      excerpt_html: excerpt_html(file),
      excerpt_text: excerpt_text(file),
      slug: slug(file),
      type: type(file),
      url: url(file)
    )
  )
end
149
+
150
# Public: Return a hash of all the raw data, as defined in the
# front-matter and including default values
#
# file - The Jekyll file
#
# Any custom data passed to the front-matter will be returned by this
# method. It ignores any key where we have a better, custom, getter, as
# well as the `excerpt` key which is handled manually.
#
# Only methods defined directly on this module count as custom getters.
# Methods merely inherited by the module (`name`, `class`, `hash`...) do
# not, so front-matter keys with such names are kept.
#
# Note that even if you define tags and categories in a collection item,
# it will not be included in the data. It's always an empty array.
#
# Returns a new Hash with symbol keys. The data of the file is left
# untouched.
def self.raw_data(file)
  # Names of the getters defined on this module itself
  getters = singleton_methods(false).map(&:to_s)

  data = file.data.reject do |key, _value|
    name = key.to_s
    name == 'excerpt' || getters.include?(name)
  end

  # Convert all keys to symbols
  Utils.keys_to_symbols(data)
end
176
+
177
# Public: Get the type of the document (page, post, collection, etc)
#
# file - The Jekyll file
#
# Pages are simple html and markdown documents in the tree
# Elements from a collection are called Documents
# Posts are a custom kind of Documents
#
# Returns the downcased class name of the file without its namespace, or
# `post` for a document whose collection is labelled `posts`
def self.type(file)
  class_name = file.class.name.split('::').last.downcase
  is_post = class_name == 'document' && file.collection.label == 'posts'

  is_post ? 'post' : class_name
end
191
+
192
# Public: Returns the url of the file, starting from the root
#
# file - The Jekyll file
#
# This is a plain delegation to the `url` method of the file.
#
# Returns the value of file.url
def self.url(file)
  file.url
end
198
+
199
# Public: Returns a timestamp of the file date
#
# file - The Jekyll file
#
# All collections have a date, either taken from the filename, or the
# `date` config set in the front-matter. Even if none is set, the current
# date is taken by default.
#
# The value is read from the `date` key of the file data. Values that can
# be converted to a Time (for instance a Date, which has no #to_i of its
# own) are converted first. Any other value is converted with its own
# #to_i.
#
# Returns the Integer timestamp, or nil if the file has no date
def self.date(file)
  date = file.data['date']
  return nil if date.nil?

  date = date.to_time if date.respond_to?(:to_time)
  date.to_i
end
212
+
213
# Public: Returns the HTML version of the excerpt
#
# file - The Jekyll file
#
# Only collections (including posts) have an excerpt. Pages don't.
#
# Returns the `excerpt` entry of the file data as a String, with newlines
# replaced by spaces and surrounding whitespace removed, or nil if there is
# no excerpt
def self.excerpt_html(file)
  raw_excerpt = file.data['excerpt']
  return nil if raw_excerpt.nil?

  single_line = raw_excerpt.to_s.tr("\n", ' ')
  single_line.strip
end
223
+
224
# Public: Returns the text version of the excerpt
#
# file - The Jekyll file
#
# Only collections (including posts) have an excerpt. Pages don't.
#
# Returns the HTML excerpt converted through Utils.html_to_text, or nil if
# there is no excerpt
def self.excerpt_text(file)
  html_excerpt = excerpt_html(file)

  html_excerpt.nil? ? nil : Utils.html_to_text(html_excerpt)
end
234
+
235
# Public: Returns the slug of the file
#
# file - The Jekyll file
#
# Slugs can be automatically extracted from collections, but for other
# files, we have to create them from the basename
#
# Returns the `slug` entry of the file data when that key exists, or the
# downcased name of the file without its extension otherwise
def self.slug(file)
  file.data.fetch('slug') do
    # No slug in the file data, so it is built from the filepath
    path = file.path
    File.basename(path, File.extname(path)).downcase
  end
end
248
+
249
# Public: Returns the name of the collection
#
# file - The Jekyll file
#
# Only collection documents can have a collection name. Pages don't. Posts
# are purposefully excluded from it as well even if they are technically
# part of a collection
#
# Returns the label of the collection of the file, or nil if the file has
# no `collection` method or if the label is `posts`
def self.collection(file)
  return nil unless file.respond_to?(:collection)

  label = file.collection.label

  # Posts are a special kind of collection, but it's an implementation
  # detail, so they are reported as having no collection
  label == 'posts' ? nil : label
end
267
+ end
268
+ end
269
+ end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
module Jekyll
  module Algolia
    # Applying user-defined hooks on the processing pipeline
    #
    # The `apply_*` methods are thin wrappers around the methods users can
    # overwrite. Going through a wrapper makes the behavior of the hooks
    # easier to test, as the wrapped methods can be mocked.
    module Hooks
      class << self
        # Public: Apply the before_indexing_each hook to the record
        #
        # record - The hash of the record to be pushed
        # node - The Nokogiri node of the element
        #
        # Returns whatever before_indexing_each returns
        def apply_each(record, node)
          before_indexing_each(record, node)
        end

        # Public: Apply the before_indexing_all hook to all records
        #
        # records - The list of all records to be indexed
        #
        # Returns whatever before_indexing_all returns
        def apply_all(records)
          before_indexing_all(records)
        end

        # Public: Check if the file should be indexed or not
        #
        # filepath - The path to the file, before transformation
        #
        # This hook allows users to define if a specific file should be
        # indexed or not. Basic exclusion can be done through the
        # `files_to_exclude` option, but a custom hook like this one can
        # allow more fine-grained customisation.
        #
        # Returns false by default, meaning no file is excluded
        def should_be_excluded?(_filepath)
          false
        end

        # Public: Custom method to be run on the record before indexing it
        #
        # record - The hash of the record to be pushed
        # node - The Nokogiri node of the element
        #
        # Users can modify the record (adding/editing/removing keys) here.
        # It can be used to remove keys that should not be indexed, or
        # access more information from the HTML node.
        #
        # Users can return nil to signal that the record should not be
        # indexed
        #
        # Returns the record untouched by default
        def before_indexing_each(record, _node)
          record
        end

        # Public: Custom method to be run on the list of all records before
        # indexing them
        #
        # records - The list of all records to be indexed
        #
        # Users can modify the full list from here. It might provide an
        # easier interface than `before_indexing_each` when knowing the
        # full context is necessary
        #
        # Returns the list untouched by default
        def before_indexing_all(records)
          records
        end
      end
    end
  end
end
@@ -0,0 +1,258 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'algoliasearch'
4
+
5
+ module Jekyll
6
+ module Algolia
7
+ # Module to push records to Algolia and configure the index
8
+ module Indexer
9
+ include Jekyll::Algolia
10
+
11
# Public: Init the module
#
# This call will initialize the Algolia API client with the application ID
# and API key read from the Configurator, then set the custom User-Agent.
def self.init
  credentials = {
    application_id: Configurator.application_id,
    api_key: Configurator.api_key
  }
  ::Algolia.init(**credentials)

  set_user_agent
end
23
+
24
# Public: Set the User-Agent to send to the API
#
# Every integration should follow the "YYY Integration" pattern, and
# every API client should follow the "Algolia for YYY" pattern. Even if
# each integration version is pinned to a specific API client version, we
# are explicit in defining it to help debug from the dashboard.
#
# The header lists, in order and separated by `; `, the versions of this
# integration, of the Algolia Ruby client, of Jekyll and of Ruby.
def self.set_user_agent
  versions = {
    'Jekyll Integration' => VERSION,
    'Algolia for Ruby' => ::Algolia::VERSION,
    'Jekyll' => ::Jekyll::VERSION,
    'Ruby' => RUBY_VERSION
  }
  user_agent = versions.map { |name, version| "#{name} (#{version})" }.join('; ')

  ::Algolia.set_extra_header('User-Agent', user_agent)
end
40
+
41
# Public: Returns an Algolia Index object from an index name
#
# index_name - String name of the index
#
# Returns a new ::Algolia::Index built from that name
def self.index(index_name)
  index_class = ::Algolia::Index
  index_class.new(index_name)
end
47
+
48
# Public: Update records of the specified index
#
# index - Algolia Index to update
# records - Array of records to update
#
# New records will be automatically added. Technically existing records
# should be updated but this case should never happen as changing a record
# content will change its objectID as well.
#
# Records are pushed in batches whose size is read from the
# `indexing_batch_size` option. Any error raised while pushing a batch is
# handed to ErrorHandler.stop along with the full list of records.
#
# In dry run mode, batches are logged but nothing is pushed
def self.update_records(index, records)
  slice_size = Configurator.algolia('indexing_batch_size')

  records.each_slice(slice_size) do |chunk|
    Logger.log("I:Pushing #{chunk.size} records")

    unless Configurator.dry_run?
      begin
        index.add_objects!(chunk)
      rescue StandardError => e
        ErrorHandler.stop(e, records: records)
      end
    end
  end
end
70
+
71
# Public: Delete records whose objectIDs are passed
#
# index - Algolia Index to target
# ids - Array of objectIDs to delete
#
# Nothing is logged nor deleted when the list is empty. Any error raised by
# the deletion is handed to ErrorHandler.stop.
#
# In dry run mode, the deletion is logged but not performed
def self.delete_records_by_id(index, ids)
  return if ids.empty?

  Logger.log("I:Deleting #{ids.length} records")

  unless Configurator.dry_run?
    begin
      index.delete_objects!(ids)
    rescue StandardError => e
      ErrorHandler.stop(e)
    end
  end
end
88
+
89
# Public: Returns an array of all the objectIDs in the index
#
# index - Algolia Index to target
#
# The returned array is sorted. It won't have any impact on the way it is
# processed, but makes debugging easier when comparing arrays is needed.
#
# Returns the sorted Array of objectIDs, or an empty Array if browsing the
# index raised an error
def self.remote_object_ids(index)
  object_ids = []

  begin
    index.browse(attributesToRetrieve: 'objectID') do |hit|
      object_ids.push(hit['objectID'])
    end
  rescue StandardError
    # The index might not exist if it's the first time we use the plugin
    # so we'll consider that it means there are no records there
    return []
  end

  object_ids.sort
end
108
+
109
# Public: Returns an array of the local objectIDs
#
# records - Array of all local records
#
# Returns the sorted Array of the :objectID of every record, ignoring
# records without one
def self.local_object_ids(records)
  object_ids = records.each_with_object([]) do |record, ids|
    object_id = record[:objectID]
    ids << object_id unless object_id.nil?
  end

  object_ids.sort
end
115
+
116
# Public: Update settings of the index
#
# index - The Algolia Index
# settings - The hash of settings to pass to the index
#
# Any error raised by the update is handed to ErrorHandler.stop along with
# the settings.
#
# In dry run mode, the update is logged (verbose) but not performed
def self.update_settings(index, settings)
  Logger.verbose('I:Updating settings')

  unless Configurator.dry_run?
    begin
      index.set_settings(settings)
    rescue StandardError => e
      ErrorHandler.stop(e, settings: settings)
    end
  end
end
131
+
132
# Public: Index content following the `diff` indexing mode
#
# records - Array of local records
#
# The `diff` indexing mode will only push new content to the index and
# remove old content from it. It won't touch records that haven't been
# updated. It will be a bit slower as it will first need to get the list
# of all records in the index, but it will consume less operations.
#
# Settings are always updated first. Remote records whose objectID is no
# longer found locally are deleted, then local records whose objectID is
# not yet in the index are pushed.
def self.run_diff_mode(records)
  remote_index = index(Configurator.index_name)

  # Update settings
  update_settings(remote_index, Configurator.settings)

  # Getting list of objectID in remote and locally
  remote_ids = remote_object_ids(remote_index)
  local_ids = local_object_ids(records)

  old_records_ids = remote_ids - local_ids
  new_records_ids = local_ids - remote_ids
  if old_records_ids.empty? && new_records_ids.empty?
    Logger.log('I:Nothing to index. Your content is already up to date.')
    return
  end

  Logger.log('I:Pushing records to Algolia...')

  # Delete remote records that are no longer available locally
  delete_records_by_id(remote_index, old_records_ids)

  # Add only records that are not yet already in the remote. A Hash lookup
  # avoids scanning the whole list of new ids once per record.
  new_ids_lookup = new_records_ids.each_with_object({}) do |id, lookup|
    lookup[id] = true
  end
  new_records = records.select do |record|
    new_ids_lookup.key?(record[:objectID])
  end
  update_records(remote_index, new_records)

  Logger.log('I:✔ Indexing complete')
end
170
+
171
# Public: Get the settings of the remote index
#
# index - The Algolia Index
#
# Returns the value of index.get_settings. Any error raised by that call is
# handed to ErrorHandler.stop instead.
def self.remote_settings(index)
  index.get_settings
rescue StandardError => e
  ErrorHandler.stop(e)
end
179
+
180
# Public: Rename an index
#
# old_name - Current name of the index
# new_name - New name of the index
#
# Any error raised by the move is handed to ErrorHandler.stop along with
# the new name.
#
# In dry run mode, the renaming is logged (verbose) but not performed
def self.rename_index(old_name, new_name)
  Logger.verbose("I:Renaming `#{old_name}` to `#{new_name}`")

  unless Configurator.dry_run?
    begin
      ::Algolia.move_index(old_name, new_name)
    rescue StandardError => e
      ErrorHandler.stop(e, new_name: new_name)
    end
  end
end
195
+
196
# Public: Index content following the `atomic` indexing mode
#
# records - Array of records to push
#
# The `atomic` indexing mode will push all records to a brand new index,
# configure it, and then overwrite the previous index with this new one.
# For the end-user, it will make all the changes in one go, making sure
# people are always searching into a fully configured index. It will
# consume more operations, but will never leave the index in a transient
# state.
#
# The temporary index is named after the configured index, with a `_tmp`
# suffix. Its settings are the settings of the current index, overridden by
# the settings of the Configurator.
def self.run_atomic_mode(records)
  target_name = Configurator.index_name
  target_index = index(target_name)
  temporary_name = "#{Configurator.index_name}_tmp"
  temporary_index = index(temporary_name)

  Logger.verbose("I:Using `#{temporary_name}` as temporary index")

  # Copying original settings to the new index
  merged_settings = remote_settings(target_index).merge(Configurator.settings)
  update_settings(temporary_index, merged_settings)

  # Pushing everything to a brand new index
  update_records(temporary_index, records)

  # Renaming the new index in place of the old
  rename_index(temporary_name, target_name)

  Logger.log('I:✔ Indexing complete')
end
227
+
228
# Public: Push all records to Algolia and configure the index
#
# records - Records to push
#
# The module is initialized first. An empty list of records is reported
# through the `no_records_found` known message and ends the process with
# an exit status of 1. Otherwise the records are indexed following the
# configured indexing mode (`diff` or `atomic`); any other mode does
# nothing.
def self.run(records)
  init

  # Indexing zero record is surely a misconfiguration
  if records.length.zero?
    Logger.known_message(
      'no_records_found',
      'files_to_exclude' => Configurator.algolia('files_to_exclude').join(', '),
      'nodes_to_index' => Configurator.algolia('nodes_to_index')
    )
    exit 1
  end

  mode = Configurator.indexing_mode
  Logger.verbose("I:Indexing mode: #{mode}")

  if mode == 'diff'
    run_diff_mode(records)
  elsif mode == 'atomic'
    run_atomic_mode(records)
  end
end
256
+ end
257
+ end
258
+ end