algoliasearch-jekyll 0.9.1 → 1.0.0.beta.pre.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +3 -4
  3. data/CONTRIBUTING.md +8 -1
  4. data/Gemfile +4 -5
  5. data/README.md +318 -11
  6. data/Rakefile +7 -12
  7. data/algoliasearch-jekyll.gemspec +66 -62
  8. data/gemfiles/jekyll_v2.gemfile +3 -3
  9. data/gemfiles/jekyll_v3.gemfile +4 -4
  10. data/gemfiles/jekyll_v3_1_3.gemfile +24 -0
  11. data/gemfiles/jekyll_v3_1_6.gemfile +24 -0
  12. data/lib/algoliasearch-jekyll.rb +1 -3
  13. data/lib/credential_checker.rb +2 -1
  14. data/lib/error_handler.rb +6 -0
  15. data/lib/push.rb +81 -19
  16. data/lib/record_extractor.rb +120 -140
  17. data/lib/utils.rb +13 -0
  18. data/lib/version.rb +1 -1
  19. data/scripts/release +13 -12
  20. data/scripts/test_v3 +1 -1
  21. data/scripts/watch +4 -0
  22. data/spec/error_handler_spec.rb +17 -0
  23. data/spec/fixtures/jekyll_version_2/404.html +8 -0
  24. data/spec/fixtures/jekyll_version_2/404.md +9 -0
  25. data/spec/fixtures/jekyll_version_2/_my-collection/collection-item.md +3 -0
  26. data/spec/fixtures/jekyll_version_2/_posts/2015-07-02-test-post.md +1 -1
  27. data/spec/fixtures/jekyll_version_2/about.md +3 -0
  28. data/spec/fixtures/jekyll_version_2/front_matter.md +15 -0
  29. data/spec/fixtures/jekyll_version_2/index.html +3 -1
  30. data/spec/fixtures/jekyll_version_2/only-divs.md +15 -0
  31. data/spec/fixtures/jekyll_version_2/only-paragraphs.md +15 -0
  32. data/spec/fixtures/jekyll_version_3/404.html +8 -0
  33. data/spec/fixtures/jekyll_version_3/404.md +9 -0
  34. data/spec/fixtures/jekyll_version_3/_config.yml +1 -1
  35. data/spec/fixtures/jekyll_version_3/_my-collection/collection-item.md +3 -0
  36. data/spec/fixtures/jekyll_version_3/_posts/2015-07-02-test-post.md +1 -1
  37. data/spec/fixtures/jekyll_version_3/about.md +3 -0
  38. data/spec/fixtures/jekyll_version_3/front_matter.md +15 -0
  39. data/spec/fixtures/jekyll_version_3/index.html +4 -1
  40. data/spec/fixtures/jekyll_version_3/only-divs.md +15 -0
  41. data/spec/fixtures/jekyll_version_3/only-paragraphs.md +15 -0
  42. data/spec/push_spec.rb +211 -8
  43. data/spec/record_extractor_spec.rb +296 -358
  44. data/spec/spec_helper.rb +32 -11
  45. data/txt/record_too_big +19 -0
  46. metadata +40 -51
  47. data/scripts/watch +0 -1
@@ -5,8 +5,9 @@ source "http://rubygems.org"
5
5
  gem "algoliasearch", "~> 1.4"
6
6
  gem "appraisal", "~> 2.1.0"
7
7
  gem "awesome_print", "~> 1.6"
8
- gem "json", ">= 1.8.6"
9
- gem "nokogiri", '~> 1.7', '>= 1.7.2'
8
+ gem "json", "~> 1.8"
9
+ gem "nokogiri", "~> 1.6"
10
+ gem "html-hierarchy-extractor", :path => "../../html-hierarchy-extractor/"
10
11
  gem "verbal_expressions", "~> 0.1.5"
11
12
  gem "jekyll", "~> 2.5"
12
13
 
@@ -19,5 +20,4 @@ group :development do
19
20
  gem "rspec", "~> 3.0"
20
21
  gem "rubocop", "~> 0.31"
21
22
  gem "simplecov", "~> 0.10"
22
- gem "rack", "< 2"
23
23
  end
@@ -5,10 +5,11 @@ source "http://rubygems.org"
5
5
  gem "algoliasearch", "~> 1.4"
6
6
  gem "appraisal", "~> 2.1.0"
7
7
  gem "awesome_print", "~> 1.6"
8
- gem "json", ">= 1.8.6"
9
- gem "nokogiri", '~> 1.7', '>= 1.7.2'
8
+ gem "json", "~> 1.8"
9
+ gem "nokogiri", "~> 1.6"
10
+ gem "html-hierarchy-extractor", :path => "../../html-hierarchy-extractor/"
10
11
  gem "verbal_expressions", "~> 0.1.5"
11
- gem "jekyll", "~> 3.0"
12
+ gem "jekyll", "3.1.6"
12
13
  gem "jekyll-paginate", "~> 1.1.0"
13
14
 
14
15
  group :development do
@@ -20,5 +21,4 @@ group :development do
20
21
  gem "rspec", "~> 3.0"
21
22
  gem "rubocop", "~> 0.31"
22
23
  gem "simplecov", "~> 0.10"
23
- gem "rack", "< 2"
24
24
  end
@@ -0,0 +1,24 @@
1
+ # This file was generated by Appraisal
2
+
3
+ source "http://rubygems.org"
4
+
5
+ gem "algoliasearch", "~> 1.4"
6
+ gem "appraisal", "~> 2.1.0"
7
+ gem "awesome_print", "~> 1.6"
8
+ gem "json", "~> 1.8"
9
+ gem "nokogiri", "~> 1.6"
10
+ gem "html-hierarchy-extractor", :path => "../../html-hierarchy-extractor/"
11
+ gem "verbal_expressions", "~> 0.1.5"
12
+ gem "jekyll", "3.1.3"
13
+ gem "jekyll-paginate", "~> 1.1.0"
14
+
15
+ group :development do
16
+ gem "coveralls", "~> 0.8"
17
+ gem "flay", "~> 2.6"
18
+ gem "flog", "~> 4.3"
19
+ gem "guard-rspec", "~> 4.6"
20
+ gem "jeweler", "~> 2.0"
21
+ gem "rspec", "~> 3.0"
22
+ gem "rubocop", "~> 0.31"
23
+ gem "simplecov", "~> 0.10"
24
+ end
@@ -0,0 +1,24 @@
1
+ # This file was generated by Appraisal
2
+
3
+ source "http://rubygems.org"
4
+
5
+ gem "algoliasearch", "~> 1.4"
6
+ gem "appraisal", "~> 2.1.0"
7
+ gem "awesome_print", "~> 1.6"
8
+ gem "json", "~> 1.8"
9
+ gem "nokogiri", "~> 1.6"
10
+ gem "html-hierarchy-extractor", :path => "../../html-hierarchy-extractor/"
11
+ gem "verbal_expressions", "~> 0.1.5"
12
+ gem "jekyll", "3.1.6"
13
+ gem "jekyll-paginate", "~> 1.1.0"
14
+
15
+ group :development do
16
+ gem "coveralls", "~> 0.8"
17
+ gem "flay", "~> 2.6"
18
+ gem "flog", "~> 4.3"
19
+ gem "guard-rspec", "~> 4.6"
20
+ gem "jeweler", "~> 2.0"
21
+ gem "rspec", "~> 3.0"
22
+ gem "rubocop", "~> 0.31"
23
+ gem "simplecov", "~> 0.10"
24
+ end
@@ -1,12 +1,10 @@
1
1
  require 'rubygems'
2
2
  require 'bundler/setup'
3
-
4
3
  require 'awesome_print'
5
-
6
4
  require_relative './version'
7
5
  require_relative './push'
8
6
 
9
- # `jekyll algolia` main entry
7
+ # Registering the `jekyll algolia push` command
10
8
  class AlgoliaSearchJekyll < Jekyll::Command
11
9
  class << self
12
10
  def init_with_program(prog)
@@ -3,7 +3,8 @@ require 'nokogiri'
3
3
  require 'json'
4
4
  require_relative './error_handler.rb'
5
5
 
6
- # Given an HTML file as input, will return an array of records to index
6
+ # Will check that all the needed credentials are correctly given by the user
7
+ # before starting any push process
7
8
  class AlgoliaSearchCredentialChecker
8
9
  attr_accessor :config, :logger
9
10
 
@@ -82,6 +82,12 @@ class AlgoliaSearchErrorHandler
82
82
  return 'check_key_acl_to_tmp_index'
83
83
  end
84
84
 
85
+ # Pushed record is above the 10KB limit
86
+ if error['http_error'] == 400 &&
87
+ error['json']['message'] =~ /^Record is too big/
88
+ return 'record_too_big'
89
+ end
90
+
85
91
  false
86
92
  end
87
93
  end
@@ -1,12 +1,12 @@
1
1
  require 'algoliasearch'
2
- require 'nokogiri'
3
2
  require 'json'
3
+ require 'nokogiri'
4
4
  require_relative './version'
5
5
  require_relative './record_extractor'
6
6
  require_relative './credential_checker'
7
7
  require_relative './error_handler'
8
8
 
9
- # `jekyll algolia push` command
9
+ # `jekyll algolia push` main command
10
10
  class AlgoliaSearchJekyllPush < Jekyll::Command
11
11
  class << self
12
12
  attr_accessor :options, :config
@@ -22,30 +22,42 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
22
22
  @args = args
23
23
  @options = options
24
24
  @config = config
25
+ @checker = AlgoliaSearchCredentialChecker.new(@config)
25
26
  @is_verbose = @config['verbose']
26
27
  @is_dry_run = @config['dry_run']
28
+ @is_lazy_update = lazy_update?
27
29
 
28
30
  self
29
31
  end
30
32
 
33
+ # Check if the lazy update feature is enabled or not (default to false)
34
+ def lazy_update?
35
+ return false unless @config['algolia']
36
+ return true if @config['algolia']['lazy_update']
37
+ false
38
+ end
39
+
31
40
  # Check if the specified file should be indexed (we exclude static files,
32
41
  # robots.txt and custom defined exclusions).
33
42
  def indexable?(file)
43
+ # Excluding all static assets (images, fonts, etc)
34
44
  return false if file.is_a?(Jekyll::StaticFile)
35
45
 
36
- basename = File.basename(file.path)
37
- extname = File.extname(basename)[1..-1]
38
-
39
- # Keep only markdown and html files
46
+ # Jekyll auto-converts markdown to HTML, so if the file is neither
47
+ # markdown nor HTML, we should probably not index it
40
48
  allowed_extensions = %w(html)
41
49
  if @config['markdown_ext']
42
50
  allowed_extensions += @config['markdown_ext'].split(',')
43
51
  end
44
- if @config['algolia']
45
- allowed_extensions += (@config['algolia']['allowed_extensions'] || [])
46
- end
47
- return false unless allowed_extensions.include?(extname)
52
+ extname = File.extname(File.basename(file.path))
53
+ return false unless allowed_extensions.include?(extname[1..-1])
48
54
 
55
+ # We should not index GitHub Pages 404 pages
56
+ # https://help.github.com/articles/creating-a-custom-404-page-for-your-github-pages-site/
57
+ basename_no_ext = File.basename(file.path, extname)
58
+ return false if basename_no_ext == '404'
59
+
60
+ # Users can also define their own blacklist and hooks to exclude files
49
61
  return false if excluded_file?(file)
50
62
 
51
63
  true
@@ -53,14 +65,17 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
53
65
 
54
66
  # Check if the file is in the list of excluded files
55
67
  def excluded_file?(file)
68
+ # Blacklist of pages generated by Jekyll that we know should not be
69
+ # indexed
56
70
  excluded = [
57
- %r{^page([0-9]*)/index\.html}
71
+ /^index\.html$/, # Index page
72
+ %r{^page([0-9]*)/index\.html} # Pagination pages
58
73
  ]
74
+ # User-provided blacklist
59
75
  if @config['algolia']
60
76
  excluded += (@config['algolia']['excluded_files'] || [])
61
77
  end
62
78
 
63
- # Exclude files explicitly excluded in _config
64
79
  excluded.each do |pattern|
65
80
  pattern = /#{Regexp.quote(pattern)}/ if pattern.is_a? String
66
81
  return true if file.path =~ pattern
@@ -88,14 +103,17 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
88
103
  items = []
89
104
  is_verbose = config['verbose']
90
105
  each_site_file do |file|
106
+ # Skip files that should not be indexed
91
107
  next unless AlgoliaSearchJekyllPush.indexable?(file)
92
108
  Jekyll.logger.info "Extracting data from #{file.path}" if is_verbose
109
+
93
110
  new_items = AlgoliaSearchRecordExtractor.new(file).extract
94
111
  next if new_items.nil?
95
112
  ap new_items if is_verbose
96
113
 
97
114
  items += new_items
98
115
  end
116
+
99
117
  AlgoliaSearchJekyllPush.push(items)
100
118
  end
101
119
 
@@ -178,14 +196,11 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
178
196
  end
179
197
  end
180
198
 
181
- def push(items)
182
- checker = AlgoliaSearchCredentialChecker.new(@config)
183
- checker.assert_valid
184
-
185
- Jekyll.logger.info '=== DRY RUN ===' if @is_dry_run
186
-
199
+ # Greedy update will push all the records to a temporary index, then
200
+ # override the existing index with this temp one
201
+ def greedy_update(items)
187
202
  # Add items to a temp index, then rename it
188
- index_name = checker.index_name
203
+ index_name = @checker.index_name
189
204
  index_name_tmp = "#{index_name}_tmp"
190
205
  batch_add_items(items, create_index(index_name_tmp))
191
206
  Algolia.move_index(index_name_tmp, index_name) unless @is_dry_run
@@ -193,5 +208,52 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
193
208
  Jekyll.logger.info "Indexing of #{items.size} items " \
194
209
  "in #{index_name} done."
195
210
  end
211
+
212
+ # Lazy update will minimize the number of operations by only pushing new
213
+ # data and deleting old data
214
+ def lazy_update(items)
215
+ index = create_index(@checker.index_name)
216
+ remote = remote_ids(index)
217
+ local = items.map { |item| item[:objectID] }
218
+
219
+ delete_remote_not_in_local(index, local, remote)
220
+
221
+ add_local_not_in_remote(index, items, local, remote)
222
+ end
223
+
224
+ # Array of all objectID in the remote index
225
+ def remote_ids(index)
226
+ list = []
227
+ index.browse(attributesToRetrieve: 'objectID') do |hit|
228
+ list << hit['objectID']
229
+ end
230
+ list
231
+ end
232
+
233
+ # Delete all remote items that are no longer in the local items
234
+ def delete_remote_not_in_local(index, local, remote)
235
+ list = remote - local
236
+ Jekyll.logger.info "Deleting #{list.size} items"
237
+ index.delete_objects!(list) unless list.empty?
238
+ end
239
+
240
+ # Push all local items that are not yet in the index
241
+ def add_local_not_in_remote(index, items, local, remote)
242
+ list = local - remote
243
+ return Jekyll.logger.info "Adding #{list.size} items" if list.empty?
244
+ items_to_push = items.select do |item|
245
+ list.include?(item[:objectID])
246
+ end
247
+ batch_add_items(items_to_push, index)
248
+ end
249
+
250
+ def push(items)
251
+ checker = AlgoliaSearchCredentialChecker.new(@config)
252
+ checker.assert_valid
253
+
254
+ Jekyll.logger.info '=== DRY RUN ===' if @is_dry_run
255
+
256
+ @is_lazy_update ? lazy_update(items) : greedy_update(items)
257
+ end
196
258
  end
197
259
  end
@@ -1,6 +1,8 @@
1
1
  require 'algoliasearch'
2
2
  require 'nokogiri'
3
3
  require 'json'
4
+ require 'html-hierarchy-extractor'
5
+ require_relative './utils'
4
6
 
5
7
  # Given an HTML file as input, will return an array of records to index
6
8
  class AlgoliaSearchRecordExtractor
@@ -25,189 +27,167 @@ class AlgoliaSearchRecordExtractor
25
27
  items
26
28
  end
27
29
 
28
- # Returns metadata from the current file
29
- def metadata
30
- metadata = {}
31
- @file.data.each { |key, value| metadata[key.to_sym] = value }
30
+ ##
31
+ # Return the type of the Jekyll element
32
+ # It can be either page, post or document
33
+ def type
34
+ classname = @file.class.name
35
+ subclass = classname.split('::')[1]
36
+ type = subclass.downcase
32
37
 
33
- metadata[:type] = @file.class.name.split('::')[1].downcase
34
- metadata[:url] = @file.url
38
+ # In Jekyll v2, Page, Post and Document have their own class
39
+ return type if AlgoliaSearchUtils.restrict_jekyll_version(less_than: '3.0')
35
40
 
36
- metadata[:slug] = slug
41
+ # In Jekyll v3, Posts are actually a specific type of Document
42
+ if type == 'document'
43
+ collection_name = @file.collection.label
44
+ return 'post' if collection_name == 'posts'
45
+ end
37
46
 
38
- metadata[:posted_at] = @file.date.to_time.to_i if @file.respond_to? :date
39
- metadata[:tags] = tags
47
+ type
48
+ end
40
49
 
41
- metadata
50
+ ##
51
+ # Return the url of the page
52
+ def url
53
+ @file.url
42
54
  end
43
55
 
56
+ ##
57
+ # Return the title of the page
58
+ def title
59
+ @file.data['title']
60
+ end
61
+
62
+ ##
44
63
  # Returns the slug of the document
45
64
  def slug
46
- # Jekyll v3 has it in data
47
- return @file.data['slug'] if @file.data.key?('slug')
48
- # Old Jekyll v2 has it at the root
49
- return @file.slug if @file.respond_to? :slug
50
- # Otherwise, we guess it from the filename
65
+ # We can guess the slug from the filename for all documents
51
66
  basename = File.basename(@file.path)
52
67
  extname = File.extname(basename)
53
- File.basename(basename, extname)
54
- end
55
-
56
- # Extract a list of tags
57
- def tags
58
- tags = nil
68
+ slug = File.basename(basename, extname)
59
69
 
60
- # Jekyll v3 has it in data, while v2 have it at the root
61
- if @file.data.key?('tags')
62
- tags = @file.data['tags']
63
- elsif @file.respond_to? :tags
64
- tags = @file.tags
65
- end
66
-
67
- return tags if tags.nil?
70
+ # Jekyll v3 posts have it in data
71
+ return @file.data['slug'] if @file.data.key?('slug')
68
72
 
69
- # Anyway, we force cast it to string as some plugins will extend the tags to
70
- # full featured objects
71
- tags.map(&:to_s)
72
- end
73
+ # Jekyll v2 posts have a specific slug method
74
+ return @file.slug if @file.respond_to?(:slug)
73
75
 
74
- # Get the list of all HTML nodes to index
75
- def html_nodes
76
- document = Nokogiri::HTML(@file.content)
77
- document.css(@config['record_css_selector'])
76
+ slug
78
77
  end
79
78
 
80
- # Check if node is a heading
81
- def node_heading?(node)
82
- %w(h1 h2 h3 h4 h5 h6).include?(node.name)
83
- end
79
+ ##
80
+ # Get an array of tags of the document
81
+ def tags
82
+ tags = []
84
83
 
85
- # Get the closest heading parent
86
- def node_heading_parent(node, level = 'h7')
87
- # If initially called on a heading, we only accept stronger headings
88
- level = node.name if level == 'h7' && node_heading?(node)
84
+ is_v2 = AlgoliaSearchUtils.restrict_jekyll_version(less_than: '3.0')
85
+ is_v3 = AlgoliaSearchUtils.restrict_jekyll_version(more_than: '3.0')
86
+ has_tags_method = @file.respond_to?(:tags)
87
+ has_tags_data = @file.data.key?('tags')
89
88
 
90
- previous = node.previous_element
89
+ # Starting from Jekyll v3, all tags are in data['tags']
90
+ tags = @file.data['tags'] if is_v3 && has_tags_data
91
91
 
92
- # No previous element, we go up to the parent
93
- unless previous
94
- parent = node.parent
95
- # No more parent, then no heading found
96
- return nil if parent.name == 'body'
97
- return node_heading_parent(parent, level)
92
+ # In Jekyll v2, tags are in data['tags'], or in .tags
93
+ if is_v2
94
+ tags = @file.tags if has_tags_method
95
+ tags = @file.data['tags'] if tags.empty? && has_tags_data
98
96
  end
99
97
 
100
- # This is a heading, we return it
101
- return previous if node_heading?(previous) && previous.name < level
102
-
103
- node_heading_parent(previous, level)
98
+ # Some extensions extend the tags with custom classes, so we make sure we
99
+ # cast them as strings
100
+ tags.map(&:to_s)
104
101
  end
105
102
 
106
- # Get all the parent headings of the specified node
107
- # If the node itself is a heading, we include it
108
- def node_hierarchy(node, state = { level: 7 })
109
- tag_name = node.name
110
- level = tag_name.delete('h').to_i
103
+ ##
104
+ # Get the post date timestamp
105
+ def date
106
+ return nil unless @file.respond_to?(:date)
111
107
 
112
- if node_heading?(node) && level < state[:level]
113
- state[tag_name.to_sym] = node_text(node)
114
- state[:level] = level
115
- end
116
-
117
- heading = node_heading_parent(node)
118
-
119
- # No previous heading, we can stop the recursion
120
- unless heading
121
- state.delete(:level)
122
- return state
123
- end
124
-
125
- node_hierarchy(heading, state)
108
+ @file.date.to_time.to_i
126
109
  end
127
110
 
128
- # Return the raw HTML of the element to index
129
- def node_raw_html(node)
130
- node.to_s
131
- end
111
+ ##
112
+ # Get the collection name of a document
113
+ def collection
114
+ return nil unless @file.respond_to?(:collection)
132
115
 
133
- # Return the text of the element, sanitized to be displayed
134
- def node_text(node)
135
- node.content.gsub('<', '&lt;').gsub('>', '&gt;')
136
- end
116
+ collection_name = @file.collection.label
137
117
 
138
- # Returns a unique string of hierarchy from title to h6, used for distinct
139
- def unique_hierarchy(data)
140
- headings = %w(title h1 h2 h3 h4 h5 h6)
141
- headings.map { |heading| data[heading.to_sym] }.compact.join(' > ')
118
+ # In Jekyll v3, posts are actually a collection
119
+ return nil if collection_name == 'posts'
120
+ collection_name
142
121
  end
143
122
 
144
- # Returns a hash of two CSS selectors. One for the node itself, and one its
145
- # closest heading parent
146
- def node_css_selector(node)
147
- return nil if node.nil?
123
+ ##
124
+ # Get a hash of all front-matter data
125
+ def front_matter
126
+ raw_data = @file.data
148
127
 
149
- # Use the CSS id if one is set
150
- return "##{node['id']}" if node['id']
151
-
152
- # Default Nokogiri selector
153
- node.css_path.gsub('html > body > ', '')
154
- end
128
+ # We clean some keys that will be handled by specific methods
129
+ attributes_to_remove = %w(title tags slug url date type)
130
+ attributes_to_remove.each do |attribute|
131
+ raw_data.delete(attribute)
132
+ end
155
133
 
156
- # The more words are in common between this node and its parent heading, the
157
- # higher the score
158
- def weight_heading_relevance(data)
159
- # Get list of unique words in headings
160
- title_words = %i(title h1 h2 h3 h4 h5 h6)
161
- .select { |title| data.key?(title) }
162
- .map { |title| data[title].to_s.split(/\W+/) }
163
- .flatten
164
- .compact
165
- .map(&:downcase)
166
- .uniq
167
- # Intersect words in headings with words in test
168
- text_words = data[:text].downcase.split(/\W+/)
169
- (title_words & text_words).size
170
- end
134
+ # Convert to symbols
135
+ data = {}
136
+ raw_data.each do |key, value|
137
+ data[key.to_sym] = value
138
+ end
171
139
 
172
- # Returns a weight based on the tag_name
173
- def weight_tag_name(item)
174
- tag_name = item[:tag_name]
175
- # No a heading, no weight
176
- return 0 unless %w(h1 h2 h3 h4 h5 h6).include?(tag_name)
177
- # h1: 100, h2: 90, ..., h6: 50
178
- 100 - (tag_name.delete('h').to_i - 1) * 10
140
+ data
179
141
  end
180
142
 
181
- # Returns an object of all weights
182
- def weight(item, index)
183
- {
184
- tag_name: weight_tag_name(item),
185
- heading_relevance: weight_heading_relevance(item),
186
- position: index
143
+ ##
144
+ # Get the list of all node data
145
+ def hierarchy_nodes
146
+ extractor_options = {
147
+ css_selector: @config['record_css_selector']
187
148
  }
149
+
150
+ HTMLHierarchyExtractor.new(
151
+ @file.content,
152
+ options: extractor_options
153
+ ).extract
188
154
  end
189
155
 
156
+ # Extract all records from the page and return the list
190
157
  def extract
158
+ # Getting all hierarchical nodes from the HTML input
159
+ raw_items = hierarchy_nodes
160
+
161
+ # Shared attributes relative to the page that all records will have
162
+ shared_attributes = {
163
+ type: type,
164
+ url: url,
165
+ title: title,
166
+ slug: slug,
167
+ date: date,
168
+ collection: collection,
169
+ tags: tags
170
+ }
171
+ # Remove empty attributes
172
+ shared_attributes = shared_attributes.delete_if do |_, value|
173
+ value.nil?
174
+ end
175
+
176
+ # Enriching with page metadata
191
177
  items = []
192
- html_nodes.each_with_index do |node, index|
193
- next if node.text.empty?
194
-
195
- item = metadata.clone
196
- item.merge!(node_hierarchy(node))
197
- item[:tag_name] = node.name
198
- item[:raw_html] = node_raw_html(node)
199
- item[:text] = node_text(node)
200
- item[:unique_hierarchy] = unique_hierarchy(item)
201
- item[:css_selector] = node_css_selector(node)
202
- item[:css_selector_parent] = node_css_selector(node_heading_parent(node))
203
- item[:weight] = weight(item, index)
204
-
205
- # We pass item through the user defined custom hook
206
- item = custom_hook_each(item, node)
178
+ raw_items.each do |raw_item|
179
+ nokogiri_node = raw_item[:node]
180
+ raw_item.delete(:node)
181
+ item = shared_attributes.merge(raw_item)
182
+ item[:objectID] = item[:uuid]
183
+ item.delete(:uuid)
184
+
185
+ item = custom_hook_each(item, nokogiri_node)
207
186
  next if item.nil?
208
187
 
209
188
  items << item
210
189
  end
190
+
211
191
  custom_hook_all(items)
212
192
  end
213
193
  end