algoliasearch-jekyll 0.9.1 → 1.0.0.beta.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +3 -4
  3. data/CONTRIBUTING.md +8 -1
  4. data/Gemfile +4 -5
  5. data/README.md +318 -11
  6. data/Rakefile +7 -12
  7. data/algoliasearch-jekyll.gemspec +66 -62
  8. data/gemfiles/jekyll_v2.gemfile +3 -3
  9. data/gemfiles/jekyll_v3.gemfile +4 -4
  10. data/gemfiles/jekyll_v3_1_3.gemfile +24 -0
  11. data/gemfiles/jekyll_v3_1_6.gemfile +24 -0
  12. data/lib/algoliasearch-jekyll.rb +1 -3
  13. data/lib/credential_checker.rb +2 -1
  14. data/lib/error_handler.rb +6 -0
  15. data/lib/push.rb +81 -19
  16. data/lib/record_extractor.rb +120 -140
  17. data/lib/utils.rb +13 -0
  18. data/lib/version.rb +1 -1
  19. data/scripts/release +13 -12
  20. data/scripts/test_v3 +1 -1
  21. data/scripts/watch +4 -0
  22. data/spec/error_handler_spec.rb +17 -0
  23. data/spec/fixtures/jekyll_version_2/404.html +8 -0
  24. data/spec/fixtures/jekyll_version_2/404.md +9 -0
  25. data/spec/fixtures/jekyll_version_2/_my-collection/collection-item.md +3 -0
  26. data/spec/fixtures/jekyll_version_2/_posts/2015-07-02-test-post.md +1 -1
  27. data/spec/fixtures/jekyll_version_2/about.md +3 -0
  28. data/spec/fixtures/jekyll_version_2/front_matter.md +15 -0
  29. data/spec/fixtures/jekyll_version_2/index.html +3 -1
  30. data/spec/fixtures/jekyll_version_2/only-divs.md +15 -0
  31. data/spec/fixtures/jekyll_version_2/only-paragraphs.md +15 -0
  32. data/spec/fixtures/jekyll_version_3/404.html +8 -0
  33. data/spec/fixtures/jekyll_version_3/404.md +9 -0
  34. data/spec/fixtures/jekyll_version_3/_config.yml +1 -1
  35. data/spec/fixtures/jekyll_version_3/_my-collection/collection-item.md +3 -0
  36. data/spec/fixtures/jekyll_version_3/_posts/2015-07-02-test-post.md +1 -1
  37. data/spec/fixtures/jekyll_version_3/about.md +3 -0
  38. data/spec/fixtures/jekyll_version_3/front_matter.md +15 -0
  39. data/spec/fixtures/jekyll_version_3/index.html +4 -1
  40. data/spec/fixtures/jekyll_version_3/only-divs.md +15 -0
  41. data/spec/fixtures/jekyll_version_3/only-paragraphs.md +15 -0
  42. data/spec/push_spec.rb +211 -8
  43. data/spec/record_extractor_spec.rb +296 -358
  44. data/spec/spec_helper.rb +32 -11
  45. data/txt/record_too_big +19 -0
  46. metadata +40 -51
  47. data/scripts/watch +0 -1
@@ -5,8 +5,9 @@ source "http://rubygems.org"
5
5
  gem "algoliasearch", "~> 1.4"
6
6
  gem "appraisal", "~> 2.1.0"
7
7
  gem "awesome_print", "~> 1.6"
8
- gem "json", ">= 1.8.6"
9
- gem "nokogiri", '~> 1.7', '>= 1.7.2'
8
+ gem "json", "~> 1.8"
9
+ gem "nokogiri", "~> 1.6"
10
+ gem "html-hierarchy-extractor", :path => "../../html-hierarchy-extractor/"
10
11
  gem "verbal_expressions", "~> 0.1.5"
11
12
  gem "jekyll", "~> 2.5"
12
13
 
@@ -19,5 +20,4 @@ group :development do
19
20
  gem "rspec", "~> 3.0"
20
21
  gem "rubocop", "~> 0.31"
21
22
  gem "simplecov", "~> 0.10"
22
- gem "rack", "< 2"
23
23
  end
@@ -5,10 +5,11 @@ source "http://rubygems.org"
5
5
  gem "algoliasearch", "~> 1.4"
6
6
  gem "appraisal", "~> 2.1.0"
7
7
  gem "awesome_print", "~> 1.6"
8
- gem "json", ">= 1.8.6"
9
- gem "nokogiri", '~> 1.7', '>= 1.7.2'
8
+ gem "json", "~> 1.8"
9
+ gem "nokogiri", "~> 1.6"
10
+ gem "html-hierarchy-extractor", :path => "../../html-hierarchy-extractor/"
10
11
  gem "verbal_expressions", "~> 0.1.5"
11
- gem "jekyll", "~> 3.0"
12
+ gem "jekyll", "3.1.6"
12
13
  gem "jekyll-paginate", "~> 1.1.0"
13
14
 
14
15
  group :development do
@@ -20,5 +21,4 @@ group :development do
20
21
  gem "rspec", "~> 3.0"
21
22
  gem "rubocop", "~> 0.31"
22
23
  gem "simplecov", "~> 0.10"
23
- gem "rack", "< 2"
24
24
  end
@@ -0,0 +1,24 @@
1
+ # This file was generated by Appraisal
2
+
3
+ source "http://rubygems.org"
4
+
5
+ gem "algoliasearch", "~> 1.4"
6
+ gem "appraisal", "~> 2.1.0"
7
+ gem "awesome_print", "~> 1.6"
8
+ gem "json", "~> 1.8"
9
+ gem "nokogiri", "~> 1.6"
10
+ gem "html-hierarchy-extractor", :path => "../../html-hierarchy-extractor/"
11
+ gem "verbal_expressions", "~> 0.1.5"
12
+ gem "jekyll", "3.1.3"
13
+ gem "jekyll-paginate", "~> 1.1.0"
14
+
15
+ group :development do
16
+ gem "coveralls", "~> 0.8"
17
+ gem "flay", "~> 2.6"
18
+ gem "flog", "~> 4.3"
19
+ gem "guard-rspec", "~> 4.6"
20
+ gem "jeweler", "~> 2.0"
21
+ gem "rspec", "~> 3.0"
22
+ gem "rubocop", "~> 0.31"
23
+ gem "simplecov", "~> 0.10"
24
+ end
@@ -0,0 +1,24 @@
1
+ # This file was generated by Appraisal
2
+
3
+ source "http://rubygems.org"
4
+
5
+ gem "algoliasearch", "~> 1.4"
6
+ gem "appraisal", "~> 2.1.0"
7
+ gem "awesome_print", "~> 1.6"
8
+ gem "json", "~> 1.8"
9
+ gem "nokogiri", "~> 1.6"
10
+ gem "html-hierarchy-extractor", :path => "../../html-hierarchy-extractor/"
11
+ gem "verbal_expressions", "~> 0.1.5"
12
+ gem "jekyll", "3.1.6"
13
+ gem "jekyll-paginate", "~> 1.1.0"
14
+
15
+ group :development do
16
+ gem "coveralls", "~> 0.8"
17
+ gem "flay", "~> 2.6"
18
+ gem "flog", "~> 4.3"
19
+ gem "guard-rspec", "~> 4.6"
20
+ gem "jeweler", "~> 2.0"
21
+ gem "rspec", "~> 3.0"
22
+ gem "rubocop", "~> 0.31"
23
+ gem "simplecov", "~> 0.10"
24
+ end
@@ -1,12 +1,10 @@
1
1
  require 'rubygems'
2
2
  require 'bundler/setup'
3
-
4
3
  require 'awesome_print'
5
-
6
4
  require_relative './version'
7
5
  require_relative './push'
8
6
 
9
- # `jekyll algolia` main entry
7
+ # Registering the `jekyll algolia push` command
10
8
  class AlgoliaSearchJekyll < Jekyll::Command
11
9
  class << self
12
10
  def init_with_program(prog)
@@ -3,7 +3,8 @@ require 'nokogiri'
3
3
  require 'json'
4
4
  require_relative './error_handler.rb'
5
5
 
6
- # Given an HTML file as input, will return an array of records to index
6
+ # Will check that all the needed credentials are correctly given by the user
7
+ # before starting any push process
7
8
  class AlgoliaSearchCredentialChecker
8
9
  attr_accessor :config, :logger
9
10
 
@@ -82,6 +82,12 @@ class AlgoliaSearchErrorHandler
82
82
  return 'check_key_acl_to_tmp_index'
83
83
  end
84
84
 
85
+ # Pushed record is above the 10KB limit
86
+ if error['http_error'] == 400 &&
87
+ error['json']['message'] =~ /^Record is too big/
88
+ return 'record_too_big'
89
+ end
90
+
85
91
  false
86
92
  end
87
93
  end
@@ -1,12 +1,12 @@
1
1
  require 'algoliasearch'
2
- require 'nokogiri'
3
2
  require 'json'
3
+ require 'nokogiri'
4
4
  require_relative './version'
5
5
  require_relative './record_extractor'
6
6
  require_relative './credential_checker'
7
7
  require_relative './error_handler'
8
8
 
9
- # `jekyll algolia push` command
9
+ # `jekyll algolia push` main command
10
10
  class AlgoliaSearchJekyllPush < Jekyll::Command
11
11
  class << self
12
12
  attr_accessor :options, :config
@@ -22,30 +22,42 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
22
22
  @args = args
23
23
  @options = options
24
24
  @config = config
25
+ @checker = AlgoliaSearchCredentialChecker.new(@config)
25
26
  @is_verbose = @config['verbose']
26
27
  @is_dry_run = @config['dry_run']
28
+ @is_lazy_update = lazy_update?
27
29
 
28
30
  self
29
31
  end
30
32
 
33
+ # Check if the lazy update feature is enabled or not (default to false)
34
+ def lazy_update?
35
+ return false unless @config['algolia']
36
+ return true if @config['algolia']['lazy_update']
37
+ false
38
+ end
39
+
31
40
  # Check if the specified file should be indexed (we exclude static files,
32
41
  # robots.txt and custom defined exclusions).
33
42
  def indexable?(file)
43
+ # Excluding all static assets (images, fonts, etc)
34
44
  return false if file.is_a?(Jekyll::StaticFile)
35
45
 
36
- basename = File.basename(file.path)
37
- extname = File.extname(basename)[1..-1]
38
-
39
- # Keep only markdown and html files
46
+ # Jekyll auto-converts markdown to HTML, so if the file is neither
47
+ # markdown nor HTML, we should probably not index it
40
48
  allowed_extensions = %w(html)
41
49
  if @config['markdown_ext']
42
50
  allowed_extensions += @config['markdown_ext'].split(',')
43
51
  end
44
- if @config['algolia']
45
- allowed_extensions += (@config['algolia']['allowed_extensions'] || [])
46
- end
47
- return false unless allowed_extensions.include?(extname)
52
+ extname = File.extname(File.basename(file.path))
53
+ return false unless allowed_extensions.include?(extname[1..-1])
48
54
 
55
+ # We should not index GitHub pages 404 pages
56
+ # https://help.github.com/articles/creating-a-custom-404-page-for-your-github-pages-site/
57
+ basename_no_ext = File.basename(file.path, extname)
58
+ return false if basename_no_ext == '404'
59
+
60
+ # Users can also define their own blacklist and hooks to exclude files
49
61
  return false if excluded_file?(file)
50
62
 
51
63
  true
@@ -53,14 +65,17 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
53
65
 
54
66
  # Check if the file is in the list of excluded files
55
67
  def excluded_file?(file)
68
+ # Blacklist of pages generated by Jekyll that we know should not be
69
+ # indexed
56
70
  excluded = [
57
- %r{^page([0-9]*)/index\.html}
71
+ /^index\.html$/, # Index page
72
+ %r{^page([0-9]*)/index\.html} # Pagination pages
58
73
  ]
74
+ # User-provided blacklist
59
75
  if @config['algolia']
60
76
  excluded += (@config['algolia']['excluded_files'] || [])
61
77
  end
62
78
 
63
- # Exclude files explicitly excluded in _config
64
79
  excluded.each do |pattern|
65
80
  pattern = /#{Regexp.quote(pattern)}/ if pattern.is_a? String
66
81
  return true if file.path =~ pattern
@@ -88,14 +103,17 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
88
103
  items = []
89
104
  is_verbose = config['verbose']
90
105
  each_site_file do |file|
106
+ # Skip files that should not be indexed
91
107
  next unless AlgoliaSearchJekyllPush.indexable?(file)
92
108
  Jekyll.logger.info "Extracting data from #{file.path}" if is_verbose
109
+
93
110
  new_items = AlgoliaSearchRecordExtractor.new(file).extract
94
111
  next if new_items.nil?
95
112
  ap new_items if is_verbose
96
113
 
97
114
  items += new_items
98
115
  end
116
+
99
117
  AlgoliaSearchJekyllPush.push(items)
100
118
  end
101
119
 
@@ -178,14 +196,11 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
178
196
  end
179
197
  end
180
198
 
181
- def push(items)
182
- checker = AlgoliaSearchCredentialChecker.new(@config)
183
- checker.assert_valid
184
-
185
- Jekyll.logger.info '=== DRY RUN ===' if @is_dry_run
186
-
199
+ # Greedy update will push all the records to a temporary index, then
200
+ # override the existing index with this temp one
201
+ def greedy_update(items)
187
202
  # Add items to a temp index, then rename it
188
- index_name = checker.index_name
203
+ index_name = @checker.index_name
189
204
  index_name_tmp = "#{index_name}_tmp"
190
205
  batch_add_items(items, create_index(index_name_tmp))
191
206
  Algolia.move_index(index_name_tmp, index_name) unless @is_dry_run
@@ -193,5 +208,52 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
193
208
  Jekyll.logger.info "Indexing of #{items.size} items " \
194
209
  "in #{index_name} done."
195
210
  end
211
+
212
+ # Lazy update will minimize the number of operations by only pushing new
213
+ # data and deleting old data
214
+ def lazy_update(items)
215
+ index = create_index(@checker.index_name)
216
+ remote = remote_ids(index)
217
+ local = items.map { |item| item[:objectID] }
218
+
219
+ delete_remote_not_in_local(index, local, remote)
220
+
221
+ add_local_not_in_remote(index, items, local, remote)
222
+ end
223
+
224
+ # Array of all objectID in the remote index
225
+ def remote_ids(index)
226
+ list = []
227
+ index.browse(attributesToRetrieve: 'objectID') do |hit|
228
+ list << hit['objectID']
229
+ end
230
+ list
231
+ end
232
+
233
+ # Delete all remote items that are no longer in the local items
234
+ def delete_remote_not_in_local(index, local, remote)
235
+ list = remote - local
236
+ Jekyll.logger.info "Deleting #{list.size} items"
237
+ index.delete_objects!(list) unless list.empty?
238
+ end
239
+
240
+ # Push all local items that are not yet in the index
241
+ def add_local_not_in_remote(index, items, local, remote)
242
+ list = local - remote
243
+ return Jekyll.logger.info "Adding #{list.size} items" if list.empty?
244
+ items_to_push = items.select do |item|
245
+ list.include?(item[:objectID])
246
+ end
247
+ batch_add_items(items_to_push, index)
248
+ end
249
+
250
+ def push(items)
251
+ checker = AlgoliaSearchCredentialChecker.new(@config)
252
+ checker.assert_valid
253
+
254
+ Jekyll.logger.info '=== DRY RUN ===' if @is_dry_run
255
+
256
+ @is_lazy_update ? lazy_update(items) : greedy_update(items)
257
+ end
196
258
  end
197
259
  end
@@ -1,6 +1,8 @@
1
1
  require 'algoliasearch'
2
2
  require 'nokogiri'
3
3
  require 'json'
4
+ require 'html-hierarchy-extractor'
5
+ require_relative './utils'
4
6
 
5
7
  # Given an HTML file as input, will return an array of records to index
6
8
  class AlgoliaSearchRecordExtractor
@@ -25,189 +27,167 @@ class AlgoliaSearchRecordExtractor
25
27
  items
26
28
  end
27
29
 
28
- # Returns metadata from the current file
29
- def metadata
30
- metadata = {}
31
- @file.data.each { |key, value| metadata[key.to_sym] = value }
30
+ ##
31
+ # Return the type of the Jekyll element
32
+ # It can be either page, post or document
33
+ def type
34
+ classname = @file.class.name
35
+ subclass = classname.split('::')[1]
36
+ type = subclass.downcase
32
37
 
33
- metadata[:type] = @file.class.name.split('::')[1].downcase
34
- metadata[:url] = @file.url
38
+ # In Jekyll v2, Page, Post and Document have their own class
39
+ return type if AlgoliaSearchUtils.restrict_jekyll_version(less_than: '3.0')
35
40
 
36
- metadata[:slug] = slug
41
+ # In Jekyll v3, Posts are actually a specific type of Document
42
+ if type == 'document'
43
+ collection_name = @file.collection.label
44
+ return 'post' if collection_name == 'posts'
45
+ end
37
46
 
38
- metadata[:posted_at] = @file.date.to_time.to_i if @file.respond_to? :date
39
- metadata[:tags] = tags
47
+ type
48
+ end
40
49
 
41
- metadata
50
+ ##
51
+ # Return the url of the page
52
+ def url
53
+ @file.url
42
54
  end
43
55
 
56
+ ##
57
+ # Return the title of the page
58
+ def title
59
+ @file.data['title']
60
+ end
61
+
62
+ ##
44
63
  # Returns the slug of the document
45
64
  def slug
46
- # Jekyll v3 has it in data
47
- return @file.data['slug'] if @file.data.key?('slug')
48
- # Old Jekyll v2 has it at the root
49
- return @file.slug if @file.respond_to? :slug
50
- # Otherwise, we guess it from the filename
65
+ # We can guess the slug from the filename for all documents
51
66
  basename = File.basename(@file.path)
52
67
  extname = File.extname(basename)
53
- File.basename(basename, extname)
54
- end
55
-
56
- # Extract a list of tags
57
- def tags
58
- tags = nil
68
+ slug = File.basename(basename, extname)
59
69
 
60
- # Jekyll v3 has it in data, while v2 have it at the root
61
- if @file.data.key?('tags')
62
- tags = @file.data['tags']
63
- elsif @file.respond_to? :tags
64
- tags = @file.tags
65
- end
66
-
67
- return tags if tags.nil?
70
+ # Jekyll v3 posts have it in data
71
+ return @file.data['slug'] if @file.data.key?('slug')
68
72
 
69
- # Anyway, we force cast it to string as some plugins will extend the tags to
70
- # full featured objects
71
- tags.map(&:to_s)
72
- end
73
+ # Jekyll v2 posts have a specific slug method
74
+ return @file.slug if @file.respond_to?(:slug)
73
75
 
74
- # Get the list of all HTML nodes to index
75
- def html_nodes
76
- document = Nokogiri::HTML(@file.content)
77
- document.css(@config['record_css_selector'])
76
+ slug
78
77
  end
79
78
 
80
- # Check if node is a heading
81
- def node_heading?(node)
82
- %w(h1 h2 h3 h4 h5 h6).include?(node.name)
83
- end
79
+ ##
80
+ # Get an array of tags of the document
81
+ def tags
82
+ tags = []
84
83
 
85
- # Get the closest heading parent
86
- def node_heading_parent(node, level = 'h7')
87
- # If initially called on a heading, we only accept stronger headings
88
- level = node.name if level == 'h7' && node_heading?(node)
84
+ is_v2 = AlgoliaSearchUtils.restrict_jekyll_version(less_than: '3.0')
85
+ is_v3 = AlgoliaSearchUtils.restrict_jekyll_version(more_than: '3.0')
86
+ has_tags_method = @file.respond_to?(:tags)
87
+ has_tags_data = @file.data.key?('tags')
89
88
 
90
- previous = node.previous_element
89
+ # Starting from Jekyll v3, all tags are in data['tags']
90
+ tags = @file.data['tags'] if is_v3 && has_tags_data
91
91
 
92
- # No previous element, we go up to the parent
93
- unless previous
94
- parent = node.parent
95
- # No more parent, then no heading found
96
- return nil if parent.name == 'body'
97
- return node_heading_parent(parent, level)
92
+ # In Jekyll v2, tags are in data['tags'], or in .tags
93
+ if is_v2
94
+ tags = @file.tags if has_tags_method
95
+ tags = @file.data['tags'] if tags.empty? && has_tags_data
98
96
  end
99
97
 
100
- # This is a heading, we return it
101
- return previous if node_heading?(previous) && previous.name < level
102
-
103
- node_heading_parent(previous, level)
98
+ # Some extensions extend the tags with custom classes, so we make sure we
99
+ # cast them as strings
100
+ tags.map(&:to_s)
104
101
  end
105
102
 
106
- # Get all the parent headings of the specified node
107
- # If the node itself is a heading, we include it
108
- def node_hierarchy(node, state = { level: 7 })
109
- tag_name = node.name
110
- level = tag_name.delete('h').to_i
103
+ ##
104
+ # Get the post date timestamp
105
+ def date
106
+ return nil unless @file.respond_to?(:date)
111
107
 
112
- if node_heading?(node) && level < state[:level]
113
- state[tag_name.to_sym] = node_text(node)
114
- state[:level] = level
115
- end
116
-
117
- heading = node_heading_parent(node)
118
-
119
- # No previous heading, we can stop the recursion
120
- unless heading
121
- state.delete(:level)
122
- return state
123
- end
124
-
125
- node_hierarchy(heading, state)
108
+ @file.date.to_time.to_i
126
109
  end
127
110
 
128
- # Return the raw HTML of the element to index
129
- def node_raw_html(node)
130
- node.to_s
131
- end
111
+ ##
112
+ # Get the collection name of a document
113
+ def collection
114
+ return nil unless @file.respond_to?(:collection)
132
115
 
133
- # Return the text of the element, sanitized to be displayed
134
- def node_text(node)
135
- node.content.gsub('<', '&lt;').gsub('>', '&gt;')
136
- end
116
+ collection_name = @file.collection.label
137
117
 
138
- # Returns a unique string of hierarchy from title to h6, used for distinct
139
- def unique_hierarchy(data)
140
- headings = %w(title h1 h2 h3 h4 h5 h6)
141
- headings.map { |heading| data[heading.to_sym] }.compact.join(' > ')
118
+ # In Jekyll v3, posts are actually a collection
119
+ return nil if collection_name == 'posts'
120
+ collection_name
142
121
  end
143
122
 
144
- # Returns a hash of two CSS selectors. One for the node itself, and one its
145
- # closest heading parent
146
- def node_css_selector(node)
147
- return nil if node.nil?
123
+ ##
124
+ # Get a hash of all front-matter data
125
+ def front_matter
126
+ raw_data = @file.data
148
127
 
149
- # Use the CSS id if one is set
150
- return "##{node['id']}" if node['id']
151
-
152
- # Default Nokogiri selector
153
- node.css_path.gsub('html > body > ', '')
154
- end
128
+ # We clean some keys that will be handled by specific methods
129
+ attributes_to_remove = %w(title tags slug url date type)
130
+ attributes_to_remove.each do |attribute|
131
+ raw_data.delete(attribute)
132
+ end
155
133
 
156
- # The more words are in common between this node and its parent heading, the
157
- # higher the score
158
- def weight_heading_relevance(data)
159
- # Get list of unique words in headings
160
- title_words = %i(title h1 h2 h3 h4 h5 h6)
161
- .select { |title| data.key?(title) }
162
- .map { |title| data[title].to_s.split(/\W+/) }
163
- .flatten
164
- .compact
165
- .map(&:downcase)
166
- .uniq
167
- # Intersect words in headings with words in test
168
- text_words = data[:text].downcase.split(/\W+/)
169
- (title_words & text_words).size
170
- end
134
+ # Convert to symbols
135
+ data = {}
136
+ raw_data.each do |key, value|
137
+ data[key.to_sym] = value
138
+ end
171
139
 
172
- # Returns a weight based on the tag_name
173
- def weight_tag_name(item)
174
- tag_name = item[:tag_name]
175
- # No a heading, no weight
176
- return 0 unless %w(h1 h2 h3 h4 h5 h6).include?(tag_name)
177
- # h1: 100, h2: 90, ..., h6: 50
178
- 100 - (tag_name.delete('h').to_i - 1) * 10
140
+ data
179
141
  end
180
142
 
181
- # Returns an object of all weights
182
- def weight(item, index)
183
- {
184
- tag_name: weight_tag_name(item),
185
- heading_relevance: weight_heading_relevance(item),
186
- position: index
143
+ ##
144
+ # Get the list of all node data
145
+ def hierarchy_nodes
146
+ extractor_options = {
147
+ css_selector: @config['record_css_selector']
187
148
  }
149
+
150
+ HTMLHierarchyExtractor.new(
151
+ @file.content,
152
+ options: extractor_options
153
+ ).extract
188
154
  end
189
155
 
156
+ # Extract all records from the page and return the list
190
157
  def extract
158
+ # Getting all hierarchical nodes from the HTML input
159
+ raw_items = hierarchy_nodes
160
+
161
+ # Shared attributes relative to the page that all records will have
162
+ shared_attributes = {
163
+ type: type,
164
+ url: url,
165
+ title: title,
166
+ slug: slug,
167
+ date: date,
168
+ collection: collection,
169
+ tags: tags
170
+ }
171
+ # Remove empty attributes
172
+ shared_attributes = shared_attributes.delete_if do |_, value|
173
+ value.nil?
174
+ end
175
+
176
+ # Enriching with page metadata
191
177
  items = []
192
- html_nodes.each_with_index do |node, index|
193
- next if node.text.empty?
194
-
195
- item = metadata.clone
196
- item.merge!(node_hierarchy(node))
197
- item[:tag_name] = node.name
198
- item[:raw_html] = node_raw_html(node)
199
- item[:text] = node_text(node)
200
- item[:unique_hierarchy] = unique_hierarchy(item)
201
- item[:css_selector] = node_css_selector(node)
202
- item[:css_selector_parent] = node_css_selector(node_heading_parent(node))
203
- item[:weight] = weight(item, index)
204
-
205
- # We pass item through the user defined custom hook
206
- item = custom_hook_each(item, node)
178
+ raw_items.each do |raw_item|
179
+ nokogiri_node = raw_item[:node]
180
+ raw_item.delete(:node)
181
+ item = shared_attributes.merge(raw_item)
182
+ item[:objectID] = item[:uuid]
183
+ item.delete(:uuid)
184
+
185
+ item = custom_hook_each(item, nokogiri_node)
207
186
  next if item.nil?
208
187
 
209
188
  items << item
210
189
  end
190
+
211
191
  custom_hook_all(items)
212
192
  end
213
193
  end