jekyll-algolia 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,64 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'algolia_html_extractor'
4
+
5
module Jekyll
  module Algolia
    # Turns a Jekyll file into the list of records to index
    module Extractor
      include Jekyll::Algolia

      # Public: Build every record to index for a given file
      #
      # file - The Jekyll file to process
      #
      # Each raw record extracted from the file content is stripped of its
      # :node, merged with the file metadata, cleaned with
      # Utils.compact_empty and finally handed to the user-defined hook.
      #
      # Returns the Array of records for which the hook did not return nil.
      # TOTEST
      def self.run(file)
        # One raw record per matching node of the HTML content
        extracted = extract_raw_records(file.content)
        # Metadata is computed once and shared by every record of the file
        file_metadata = FileBrowser.metadata(file)

        extracted.each_with_object([]) do |raw, kept|
          # The HTML node is only handed to the hook, it never ends up in the
          # final record
          html_node = raw.delete(:node)

          merged = Utils.compact_empty(raw.merge(file_metadata))

          # A hook returning `nil` signals that the record must not be indexed
          hooked = Hooks.apply_each(merged, html_node)
          kept << hooked unless hooked.nil?
        end
      end

      # Public: Set the :objectID field of a record to an identifier computed
      # by AlgoliaHTMLExtractor.uuid from the record itself
      #
      # record - The Hash of the record (modified in place)
      #
      # Returns the same record.
      def self.add_unique_object_id(record)
        object_id = AlgoliaHTMLExtractor.uuid(record)
        record[:objectID] = object_id
        record
      end

      # Public: Extract the raw records from some HTML content
      #
      # content - The HTML content to parse
      #
      # Only the nodes matching the `nodes_to_index` option are extracted.
      # Returns whatever AlgoliaHTMLExtractor.run returns for that selector.
      def self.extract_raw_records(content)
        selector = Configurator.algolia('nodes_to_index')
        AlgoliaHTMLExtractor.run(content, options: { css_selector: selector })
      end
    end
  end
end
@@ -0,0 +1,269 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'algolia_html_extractor'
4
+
5
module Jekyll
  module Algolia
    # Module to get information about Jekyll files. Jekyll handles posts,
    # pages, collections, etc. They each need specific processing, so knowing
    # which kind of file we're working on will help.
    #
    # We also do not index all files. This module helps in defining which
    # files should be indexed and which should not.
    module FileBrowser
      include Jekyll::Algolia

      # Public: Check if the specified file is a static Jekyll asset
      #
      # file - The Jekyll file
      #
      # We don't index static assets (js, css, images)
      def self.static_file?(file)
        file.is_a?(Jekyll::StaticFile)
      end

      # Public: Check if the file is a 404 error page
      #
      # file - The Jekyll file
      #
      # 404 pages are not Jekyll defaults but a convention adopted by GitHub
      # pages. We don't want to index those.
      # Source: https://help.github.com/articles/creating-a-custom-404-page-for-your-github-pages-site/
      #
      # rubocop:disable Naming/PredicateName
      def self.is_404?(file)
        File.basename(file.path, File.extname(file.path)) == '404'
      end
      # rubocop:enable Naming/PredicateName

      # Public: Check if the page is a pagination page
      #
      # file - The Jekyll file
      #
      # `jekyll-paginate` automatically creates pages to paginate through
      # posts. We don't want to index those
      def self.pagination_page?(file)
        Utils.match?(file.path, %r{page([0-9]*)/index\.html$})
      end

      # Public: Check if the file has one of the allowed extensions
      #
      # file - The Jekyll file
      #
      # Jekyll can transform markdown files to HTML by default. With plugins,
      # it can convert many more file formats. By default we'll only index
      # markdown and raw HTML files but this list can be extended using the
      # `extensions_to_index` config option.
      def self.allowed_extension?(file)
        extensions = Configurator.algolia('extensions_to_index')
        # Extension without its leading dot (nil when the file has none)
        extname = File.extname(file.path)[1..-1]
        extensions.include?(extname)
      end

      # Public: Check if the file has been excluded by the user
      #
      # file - The Jekyll file
      #
      # Files can be excluded either by setting the `files_to_exclude` option,
      # or by defining a custom hook
      def self.excluded_by_user?(file)
        excluded_from_config?(file) || excluded_from_hook?(file)
      end

      # Public: Check if the file has been excluded by `files_to_exclude`
      #
      # file - The Jekyll file
      #
      # The glob patterns are expanded from the Jekyll source directory, so
      # they yield paths relative to it. The file path is therefore made
      # relative to the source as well before being compared: collection
      # items and posts expose an absolute path and would otherwise never
      # match.
      #
      # Returns true if one of the patterns matches the file.
      def self.excluded_from_config?(file)
        excluded_patterns = Array(Configurator.algolia('files_to_exclude'))
        relative_path = path_from_root(file)

        Dir.chdir(Configurator.get('source')) do
          excluded_patterns.any? do |pattern|
            Dir.glob(pattern).include?(relative_path)
          end
        end
      end

      # Public: Check if the file has been excluded by running a custom user
      # hook
      #
      # file - The Jekyll file
      def self.excluded_from_hook?(file)
        Hooks.should_be_excluded?(file.path)
      end

      # Public: Return the path to the original file, relative from the Jekyll
      # source
      #
      # file - The Jekyll file
      #
      # Pages have their .path property relative to the source, but
      # collections (including posts) have an absolute file path.
      #
      # The source directory is matched as a plain string prefix (not as a
      # regular expression) so that directories containing characters such as
      # `+`, `.` or `(` are handled correctly. Paths that do not start with
      # the source directory are returned unchanged.
      def self.path_from_root(file)
        prefix = "#{Configurator.get('source')}/"
        path = file.path
        return path unless path.start_with?(prefix)

        path[prefix.length..-1]
      end

      # Public: Check if the file should be indexed
      #
      # file - The Jekyll file
      #
      # There are many reasons a file should not be indexed. We need to
      # exclude all the static assets, only keep the actual content.
      def self.indexable?(file)
        return false if static_file?(file)
        return false if is_404?(file)
        return false if pagination_page?(file)
        return false unless allowed_extension?(file)
        return false if excluded_by_user?(file)

        true
      end

      # Public: Return a hash of all the file metadata
      #
      # file - The Jekyll file
      #
      # It contains both the raw metadata extracted from the front-matter, as
      # well as more specific fields like the collection name, date timestamp,
      # slug, type and url. Specific fields take precedence over raw ones and
      # the result is cleaned with Utils.compact_empty.
      def self.metadata(file)
        front_matter = raw_data(file)
        specific_data = {
          collection: collection(file),
          date: date(file),
          excerpt_html: excerpt_html(file),
          excerpt_text: excerpt_text(file),
          slug: slug(file),
          type: type(file),
          url: url(file)
        }

        Utils.compact_empty(front_matter.merge(specific_data))
      end

      # Public: Return a hash of all the raw data, as defined in the
      # front-matter and including default values
      #
      # file - The Jekyll file
      #
      # Any custom data passed to the front-matter will be returned by this
      # method. It ignores any key where we have a better, custom, getter.
      #
      # Note that even if you define tags and categories in a collection item,
      # it will not be included in the data. It's always an empty array.
      #
      # Returns a new Hash with symbol keys; file.data is left untouched.
      def self.raw_data(file)
        # Only the methods defined on this very module count as getters.
        # Checking `respond_to?` would also match every method inherited from
        # Module and Object, silently dropping legitimate front-matter keys
        # such as `name` or `hash`.
        getters = singleton_methods(false).map(&:to_s)

        data = file.data.reject { |key, _| getters.include?(key.to_s) }

        # Also delete keys we manually handle
        data.delete('excerpt')

        # Convert all keys to symbols
        Utils.keys_to_symbols(data)
      end

      # Public: Get the type of the document (page, post, collection, etc)
      #
      # file - The Jekyll file
      #
      # Pages are simple html and markdown documents in the tree
      # Elements from a collection are called Documents
      # Posts are a custom kind of Documents
      #
      # Returns the downcased class name of the file (without its namespace),
      # or 'post' for a document of the `posts` collection.
      def self.type(file)
        type = file.class.name.split('::').last.downcase

        type = 'post' if type == 'document' && file.collection.label == 'posts'

        type
      end

      # Public: Returns the url of the file, starting from the root
      #
      # file - The Jekyll file
      def self.url(file)
        file.url
      end

      # Public: Returns a timestamp of the file date
      #
      # file - The Jekyll file
      #
      # All collections have a date, either taken from the filename, or the
      # `date` config set in the front-matter. Even if none is set, the
      # current date is taken by default.
      #
      # Returns nil when the file has no date.
      def self.date(file)
        date = file.data['date']
        return nil if date.nil?

        # Date and DateTime values do not respond to #to_i, so anything that
        # can be converted to a Time is converted first.
        # NOTE(review): a String date still goes through String#to_i and will
        # not yield a meaningful timestamp.
        date = date.to_time if date.respond_to?(:to_time)
        date.to_i
      end

      # Public: Returns the HTML version of the excerpt
      #
      # file - The Jekyll file
      #
      # Only collections (including posts) have an excerpt. Pages don't.
      # Newlines are replaced with spaces and the result is stripped.
      def self.excerpt_html(file)
        excerpt = file.data['excerpt']
        return nil if excerpt.nil?

        excerpt.to_s.tr("\n", ' ').strip
      end

      # Public: Returns the text version of the excerpt
      #
      # file - The Jekyll file
      #
      # Only collections (including posts) have an excerpt. Pages don't.
      def self.excerpt_text(file)
        html = excerpt_html(file)
        return nil if html.nil?

        Utils.html_to_text(html)
      end

      # Public: Returns the slug of the file
      #
      # file - The Jekyll file
      #
      # Slugs can be automatically extracted from collections, but for other
      # files, we have to create them from the basename
      def self.slug(file)
        # We get the real slug from the file data if available
        return file.data['slug'] if file.data.key?('slug')

        # We create it ourselves from the filepath otherwise
        File.basename(file.path, File.extname(file.path)).downcase
      end

      # Public: Returns the name of the collection
      #
      # file - The Jekyll file
      #
      # Only collection documents can have a collection name. Pages don't.
      # Posts are purposefully excluded from it as well even if they are
      # technically part of a collection
      def self.collection(file)
        return nil unless file.respond_to?(:collection)

        collection_name = file.collection.label

        # Posts are a special kind of collection, but that is an
        # implementation detail, so they are excluded
        return nil if collection_name == 'posts'

        collection_name
      end
    end
  end
end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
module Jekyll
  module Algolia
    # User-defined hooks applied along the processing pipeline
    module Hooks
      class << self
        # Public: Run the `before_indexing_each` hook on a record.
        #
        # record - The hash of the record to be pushed
        # node - The Nokogiri node of the element
        #
        # Thin wrapper around a method users can overwrite. Going through a
        # wrapper makes the behavior easier to test, as the hook itself can
        # be mocked.
        def apply_each(record, node)
          before_indexing_each(record, node)
        end

        # Public: Run the `before_indexing_all` hook on the list of records.
        #
        # records - The list of all records to be indexed
        #
        # Thin wrapper around a method users can overwrite. Going through a
        # wrapper makes the behavior easier to test, as the hook itself can
        # be mocked.
        def apply_all(records)
          before_indexing_all(records)
        end

        # Public: Check if the file should be excluded from indexing
        #
        # filepath - The path to the file, before transformation
        #
        # This hook allows users to decide if a specific file should be
        # indexed or not. Basic exclusion can be done through the
        # `files_to_exclude` option, but a custom hook like this one allows
        # more fine-grained customisation.
        #
        # The default implementation never excludes anything.
        def should_be_excluded?(_filepath)
          false
        end

        # Public: Custom method to be run on the record before indexing it
        #
        # record - The hash of the record to be pushed
        # node - The Nokogiri node of the element
        #
        # Users can modify the record (adding/editing/removing keys) here. It
        # can be used to remove keys that should not be indexed, or to access
        # more information from the HTML node.
        #
        # Users can return nil to signal that the record should not be
        # indexed. The default implementation returns the record untouched.
        def before_indexing_each(record, _node)
          record
        end

        # Public: Custom method to be run on the list of all records before
        # indexing them
        #
        # records - The list of all records to be indexed
        #
        # Users can modify the full list from here. It might provide an
        # easier interface than `before_indexing_each` when knowing the full
        # context is necessary. The default implementation returns the list
        # untouched.
        def before_indexing_all(records)
          records
        end
      end
    end
  end
end
@@ -0,0 +1,258 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'algoliasearch'
4
+
5
module Jekyll
  module Algolia
    # Module to push records to Algolia and configure the index
    module Indexer
      include Jekyll::Algolia

      # Public: Init the module
      #
      # This call will instantiate the Algolia API client with the
      # application id and API key read from the Configurator, then set the
      # custom User-Agent
      def self.init
        ::Algolia.init(
          application_id: Configurator.application_id,
          api_key: Configurator.api_key
        )

        set_user_agent
      end

      # Public: Set the User-Agent to send to the API
      #
      # Every integrations should follow the "YYY Integration" pattern, and
      # every API client should follow the "Algolia for YYY" pattern. Even if
      # each integration version is pinned to a specific API client version,
      # we are explicit in defining it to help debug from the dashboard.
      def self.set_user_agent
        user_agent = [
          "Jekyll Integration (#{VERSION})",
          "Algolia for Ruby (#{::Algolia::VERSION})",
          "Jekyll (#{::Jekyll::VERSION})",
          "Ruby (#{RUBY_VERSION})"
        ].join('; ')

        ::Algolia.set_extra_header('User-Agent', user_agent)
      end

      # Public: Returns an Algolia Index object from an index name
      #
      # index_name - String name of the index
      def self.index(index_name)
        ::Algolia::Index.new(index_name)
      end

      # Public: Update records of the specified index
      #
      # index - Algolia Index to update
      # records - Array of records to update
      #
      # New records will be automatically added. Technically existing records
      # should be updated but this case should never happen as changing a
      # record content will change its objectID as well.
      #
      # Records are pushed in batches of `indexing_batch_size`. Any error is
      # forwarded to ErrorHandler.stop along with the full list of records.
      #
      # In dry run mode the batches are logged but nothing is pushed
      def self.update_records(index, records)
        batch_size = Configurator.algolia('indexing_batch_size')
        records.each_slice(batch_size) do |batch|
          Logger.log("I:Pushing #{batch.size} records")
          next if Configurator.dry_run?

          begin
            index.add_objects!(batch)
          rescue StandardError => error
            ErrorHandler.stop(error, records: records)
          end
        end
      end

      # Public: Delete records whose objectIDs are passed
      #
      # index - Algolia Index to target
      # ids - Array of objectIDs to delete
      #
      # Does nothing when the list is empty. In dry run mode the deletion is
      # logged but not performed
      def self.delete_records_by_id(index, ids)
        return if ids.empty?

        Logger.log("I:Deleting #{ids.length} records")
        return if Configurator.dry_run?

        begin
          index.delete_objects!(ids)
        rescue StandardError => error
          ErrorHandler.stop(error)
        end
      end

      # Public: Returns an array of all the objectIDs in the index
      #
      # index - Algolia Index to target
      #
      # The returned array is sorted. It won't have any impact on the way it
      # is processed, but makes debugging easier when comparing arrays is
      # needed.
      def self.remote_object_ids(index)
        list = []
        begin
          index.browse(attributesToRetrieve: 'objectID') do |hit|
            list << hit['objectID']
          end
        rescue StandardError
          # The index might not exist if it's the first time we use the plugin
          # so we'll consider that it means there are no records there
          return []
        end
        list.sort
      end

      # Public: Returns a sorted array of the local objectIDs
      #
      # records - Array of all local records
      #
      # Records without an :objectID are ignored
      def self.local_object_ids(records)
        records.map { |record| record[:objectID] }.compact.sort
      end

      # Public: Update settings of the index
      #
      # index - The Algolia Index
      # settings - The hash of settings to pass to the index
      #
      # Does nothing (besides logging) in dry run mode
      def self.update_settings(index, settings)
        Logger.verbose('I:Updating settings')
        return if Configurator.dry_run?

        begin
          index.set_settings(settings)
        rescue StandardError => error
          ErrorHandler.stop(error, settings: settings)
        end
      end

      # Public: Index content following the `diff` indexing mode
      #
      # records - Array of local records
      #
      # The `diff` indexing mode will only push new content to the index and
      # remove old content from it. It won't touch records that haven't been
      # updated. It will be a bit slower as it will first need to get the
      # list of all records in the index, but it will consume less operations.
      def self.run_diff_mode(records)
        target_index = index(Configurator.index_name)

        # Update settings
        update_settings(target_index, Configurator.settings)

        # Getting list of objectID in remote and locally
        remote_ids = remote_object_ids(target_index)
        local_ids = local_object_ids(records)

        old_records_ids = remote_ids - local_ids
        new_records_ids = local_ids - remote_ids
        if old_records_ids.empty? && new_records_ids.empty?
          Logger.log('I:Nothing to index. Your content is already up to date.')
          return
        end

        Logger.log('I:Pushing records to Algolia...')

        # Delete remote records that are no longer available locally
        delete_records_by_id(target_index, old_records_ids)

        # Add only records that are not yet already in the remote. The ids are
        # indexed in a hash first so each membership test is a constant-time
        # lookup instead of a scan of the whole list of ids.
        to_add = new_records_ids.each_with_object({}) do |id, lookup|
          lookup[id] = true
        end
        new_records = records.select { |record| to_add.key?(record[:objectID]) }
        update_records(target_index, new_records)

        Logger.log('I:✔ Indexing complete')
      end

      # Public: Get the settings of the remote index
      #
      # index - The Algolia Index
      #
      # Any error is forwarded to ErrorHandler.stop
      def self.remote_settings(index)
        index.get_settings
      rescue StandardError => error
        ErrorHandler.stop(error)
      end

      # Public: Rename an index
      #
      # old_name - Current name of the index
      # new_name - New name of the index
      #
      # Does nothing (besides logging) in dry run mode
      def self.rename_index(old_name, new_name)
        Logger.verbose("I:Renaming `#{old_name}` to `#{new_name}`")
        return if Configurator.dry_run?

        begin
          ::Algolia.move_index(old_name, new_name)
        rescue StandardError => error
          ErrorHandler.stop(error, new_name: new_name)
        end
      end

      # Public: Index content following the `atomic` indexing mode
      #
      # records - Array of records to push
      #
      # The `atomic` indexing mode will push all records to a brand new index,
      # configure it, and then overwrite the previous index with this new one.
      # For the end-user, it will make all the changes in one go, making sure
      # people are always searching into a fully configured index. It will
      # consume more operations, but will never leave the index in a transient
      # state.
      def self.run_atomic_mode(records)
        index_name = Configurator.index_name
        current_index = index(index_name)
        index_tmp_name = "#{Configurator.index_name}_tmp"
        index_tmp = index(index_tmp_name)

        Logger.verbose("I:Using `#{index_tmp_name}` as temporary index")

        # Copying original settings to the new index, local settings taking
        # precedence over the remote ones
        current_settings = remote_settings(current_index)
        new_settings = current_settings.merge(Configurator.settings)
        update_settings(index_tmp, new_settings)

        # Pushing everything to a brand new index
        update_records(index_tmp, records)

        # Renaming the new index in place of the old
        rename_index(index_tmp_name, index_name)

        Logger.log('I:✔ Indexing complete')
      end

      # Public: Push all records to Algolia and configure the index
      #
      # records - Records to push
      #
      # Exits the process with status 1 when there is no record to push.
      # Indexing modes other than 'diff' and 'atomic' are ignored.
      def self.run(records)
        init

        record_count = records.length

        # Indexing zero record is surely a misconfiguration
        if record_count.zero?
          files_to_exclude = Configurator.algolia('files_to_exclude').join(', ')
          Logger.known_message(
            'no_records_found',
            'files_to_exclude' => files_to_exclude,
            'nodes_to_index' => Configurator.algolia('nodes_to_index')
          )
          exit 1
        end

        indexing_mode = Configurator.indexing_mode
        Logger.verbose("I:Indexing mode: #{indexing_mode}")
        case indexing_mode
        when 'diff'
          run_diff_mode(records)
        when 'atomic'
          run_atomic_mode(records)
        end
      end
    end
  end
end