RubyGems - algoliasearch-jekyll - Versions diffs - 0.9.1 → 1.0.0.beta.pre.1 - Mend

algoliasearch-jekyll 0.9.1 → 1.0.0.beta.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47)

checksums.yaml +4 -4
data/.travis.yml +3 -4
data/CONTRIBUTING.md +8 -1
data/Gemfile +4 -5
data/README.md +318 -11
data/Rakefile +7 -12
data/algoliasearch-jekyll.gemspec +66 -62
data/gemfiles/jekyll_v2.gemfile +3 -3
data/gemfiles/jekyll_v3.gemfile +4 -4
data/gemfiles/jekyll_v3_1_3.gemfile +24 -0
data/gemfiles/jekyll_v3_1_6.gemfile +24 -0
data/lib/algoliasearch-jekyll.rb +1 -3
data/lib/credential_checker.rb +2 -1
data/lib/error_handler.rb +6 -0
data/lib/push.rb +81 -19
data/lib/record_extractor.rb +120 -140
data/lib/utils.rb +13 -0
data/lib/version.rb +1 -1
data/scripts/release +13 -12
data/scripts/test_v3 +1 -1
data/scripts/watch +4 -0
data/spec/error_handler_spec.rb +17 -0
data/spec/fixtures/jekyll_version_2/404.html +8 -0
data/spec/fixtures/jekyll_version_2/404.md +9 -0
data/spec/fixtures/jekyll_version_2/_my-collection/collection-item.md +3 -0
data/spec/fixtures/jekyll_version_2/_posts/2015-07-02-test-post.md +1 -1
data/spec/fixtures/jekyll_version_2/about.md +3 -0
data/spec/fixtures/jekyll_version_2/front_matter.md +15 -0
data/spec/fixtures/jekyll_version_2/index.html +3 -1
data/spec/fixtures/jekyll_version_2/only-divs.md +15 -0
data/spec/fixtures/jekyll_version_2/only-paragraphs.md +15 -0
data/spec/fixtures/jekyll_version_3/404.html +8 -0
data/spec/fixtures/jekyll_version_3/404.md +9 -0
data/spec/fixtures/jekyll_version_3/_config.yml +1 -1
data/spec/fixtures/jekyll_version_3/_my-collection/collection-item.md +3 -0
data/spec/fixtures/jekyll_version_3/_posts/2015-07-02-test-post.md +1 -1
data/spec/fixtures/jekyll_version_3/about.md +3 -0
data/spec/fixtures/jekyll_version_3/front_matter.md +15 -0
data/spec/fixtures/jekyll_version_3/index.html +4 -1
data/spec/fixtures/jekyll_version_3/only-divs.md +15 -0
data/spec/fixtures/jekyll_version_3/only-paragraphs.md +15 -0
data/spec/push_spec.rb +211 -8
data/spec/record_extractor_spec.rb +296 -358
data/spec/spec_helper.rb +32 -11
data/txt/record_too_big +19 -0
metadata +40 -51
data/scripts/watch +0 -1

data/spec/record_extractor_spec.rb CHANGED

@@ -3,643 +3,581 @@ require 'spec_helper'
 describe(AlgoliaSearchRecordExtractor) do
   let(:extractor) { AlgoliaSearchRecordExtractor }
   let(:site) { get_site }
-  let(:page_file) { extractor.new(site.file_by_name('about.md')) }
-  let(:html_page_file) { extractor.new(site.file_by_name('authors.html')) }
-  let(:post_file) { extractor.new(site.file_by_name('test-post.md')) }
-  let(:hierarchy_page_file) { extractor.new(site.file_by_name('hierarchy.md')) }
-  let(:weight_page_file) { extractor.new(site.file_by_name('weight.md')) }
-  let(:document_file) { extractor.new(site.file_by_name('collection-item.md')) }
+  let(:fixture_page) { extractor.new(site.file_by_name('about.md')) }
+  let(:fixture_post) { extractor.new(site.file_by_name('test-post.md')) }
+  let(:fixture_document) do
+    extractor.new(site.file_by_name('collection-item.md'))
+  end
+  let(:fixture_only_paragraphs) do
+    extractor.new(site.file_by_name('only-paragraphs.md'))
+  end
+  let(:fixture_front_matter) do
+    extractor.new(site.file_by_name('front_matter.md'))
+  end
   before(:each) do
-    # Disabling the logs, while still allowing to spy them
-    Jekyll.logger = double('Specific Mock Logger').as_null_object
-    @logger = Jekyll.logger.writer
+    mock_logger
   end
-  describe 'metadata' do
-    it 'gets metadata from page' do
+  describe 'type' do
+    it 'should recognize a page' do
       # Given
-      actual = page_file.metadata
-      # Then
-      expect(actual[:type]).to eq 'page'
-      expect(actual[:slug]).to eq 'about'
-      expect(actual[:title]).to eq 'About page'
-      expect(actual[:url]).to eq '/about.html'
-      expect(actual[:custom]).to eq 'Foo'
-    end
+      input = fixture_page
-    it 'gets metadata from post' do
-      # Given
-      actual = post_file.metadata
+      # When
+      actual = input.type
-      # Then
-      expect(actual[:slug]).to eq 'test-post'
-      expect(actual[:title]).to eq 'Test post'
-      expect(actual[:url]).to eq '/2015/07/02/test-post.html'
-      expect(actual[:posted_at]).to eq 1_435_788_000
-      expect(actual[:custom]).to eq 'Foo'
+      expect(actual).to eq 'page'
     end
-    it 'gets posted_at timestamp based on the configured timezone' do
+    it 'should recognize a post' do
       # Given
-      site = get_site(timezone: 'America/New_York')
-      post_file = extractor.new(site.file_by_name('test-post.md'))
-      actual = post_file.metadata
-      # Then
-      expect(actual[:posted_at]).to eq 1_435_809_600
-    end
+      input = fixture_post
-    it 'gets metadata from document' do
-      # Given
-      actual = document_file.metadata
+      # When
+      actual = input.type
-      # Then
-      expect(actual[:type]).to eq 'document'
-      expect(actual[:slug]).to eq 'collection-item'
-      expect(actual[:title]).to eq 'Collection Item'
-      expect(actual[:url]).to eq '/my-collection/collection-item.html'
-      expect(actual[:custom]).to eq 'Foo'
+      expect(actual).to eq 'post'
     end
-    if restrict_jekyll_version(more_than: '3.0')
-      describe 'Jekyll > 3.0' do
-        it 'should not throw any deprecation warnings' do
-          # Given
-          # When
-          post_file.metadata
+    it 'should recognize a document' do
+      # Given
+      input = fixture_document
-          # Expect
-          expect(@logger).to_not have_received(:warn)
-        end
-      end
+      # When
+      actual = input.type
+      expect(actual).to eq 'document'
     end
   end
-  describe 'slug' do
-    it 'gets it from data if available' do
+  describe 'url' do
+    it 'should use the page url' do
       # Given
-      post_file.file.data['slug'] = 'foo'
-      allow(post_file.file).to receive(:respond_to?).with(:slug) do
-        false
-      end
+      input = fixture_page
       # When
-      actual = post_file.slug
+      actual = input.url
-      # Then
-      expect(actual).to eql('foo')
+      expect(actual).to eq '/about.html'
     end
-    it 'gets it from the root if not in data' do
+    it 'should use the post url' do
       # Given
-      post_file.file.data.delete 'slug'
-      allow(post_file.file).to receive(:slug).and_return('foo')
+      input = fixture_post
       # When
-      actual = post_file.slug
+      actual = input.url
-      # Then
-      expect(actual).to eql('foo')
+      expect(actual).to eq '/2015/07/02/test-post.html'
     end
-    it 'gets it from the data even if in the root' do
+    it 'should use the document url' do
       # Given
-      post_file.file.data['slug'] = 'foo'
-      allow(post_file.file).to receive(:slug).and_return('bar')
+      input = fixture_document
       # When
-      actual = post_file.slug
+      actual = input.url
-      # Then
-      expect(actual).to eql('foo')
+      expect(actual).to eq '/my-collection/collection-item.html'
     end
+  end
-    it 'guesses it from the path if not found' do
+  describe 'title' do
+    it 'should use the page title' do
       # Given
-      post_file.file.data.delete 'slug'
-      allow(post_file.file).to receive(:respond_to?).with(:slug) do
-        false
-      end
-      allow(post_file.file).to receive(:path) do
-        '/path/to/file/foo.html'
-      end
+      input = fixture_page
       # When
-      actual = post_file.slug
+      actual = input.title
-      # # Then
-      expect(actual).to eql('foo')
+      expect(actual).to eq 'About page'
     end
-  end
-  describe 'tags' do
-    it 'returns tags in data if available' do
+    it 'should use the post title' do
       # Given
-      post_file.file.data['tags'] = %w(foo bar)
-      allow(post_file.file).to receive(:respond_to?).with(:tags) do
-        false
-      end
+      input = fixture_post
       # When
-      actual = post_file.tags
+      actual = input.title
-      # Then
-      expect(actual).to include('foo', 'bar')
+      expect(actual).to eq 'Test post'
     end
-    it 'returns tags at the root if not in data' do
+    it 'should use the document title' do
       # Given
-      post_file.file.data.delete 'tags'
-      allow(post_file.file).to receive(:tags).and_return(%w(foo bar))
+      input = fixture_document
       # When
-      actual = post_file.tags
+      actual = input.title
-      # Then
-      expect(actual).to include('foo', 'bar')
+      expect(actual).to eq 'Collection Item'
+    end
+  end
+  describe 'slug' do
+    if restrict_jekyll_version(more_than: '3.0')
+      it 'should not throw a deprecation warning' do
+        # Given
+        input = fixture_post
+        # When
+        input.slug
+        # Then
+        expect(Jekyll.logger)
+          .to_not have_received(:warn).with('Deprecation:', any_args)
+      end
     end
-    it 'returns tags in data even if in root' do
+    it 'should get it for a page' do
       # Given
-      post_file.file.data['tags'] = %w(foo bar)
-      allow(post_file.file).to receive(:tags).and_return(%w(js css))
+      input = fixture_page
       # When
-      actual = post_file.tags
+      actual = input.slug
-      # Then
-      expect(actual).to include('foo', 'bar')
+      expect(actual).to eq 'about'
     end
-    it 'parses tags as string if they are another type' do
+    it 'should get it for a post' do
       # Given
-      tag_foo = double('Extended Tag', to_s: 'foo')
-      tag_bar = double('Extended Tag', to_s: 'bar')
-      post_file.file.data['tags'] = [tag_foo, tag_bar]
-      allow(post_file.file).to receive(:respond_to?).with(:tags) do
-        false
-      end
+      input = fixture_post
       # When
-      actual = post_file.tags
+      actual = input.slug
-      # Then
-      expect(actual).to include('foo', 'bar')
+      expect(actual).to eq 'test-post'
     end
-    it 'extract tags from front matter' do
+    it 'should get it for a document' do
       # Given
-      actual = post_file.tags
+      input = fixture_document
-      # Then
-      expect(actual).to include('tag', 'another tag')
+      # When
+      actual = input.slug
+      expect(actual).to eq 'collection-item'
     end
   end
-  describe 'html_nodes' do
-    it 'returns the list of all <p> by default' do
-      expect(page_file.html_nodes.size).to eq 6
-    end
+  describe 'tags' do
+    if restrict_jekyll_version(more_than: '3.0')
+      it 'should not throw a deprecation warning' do
+        # Given
+        input = fixture_post
-    it 'allow _config.yml to override the selector' do
-      # Given
-      site = get_site(algolia: { 'record_css_selector' => 'p,ul' })
-      page_file = extractor.new(site.file_by_name('about.md'))
+        # When
+        input.tags
-      expect(page_file.html_nodes.size).to eq 7
+        # Then
+        expect(Jekyll.logger)
+          .to_not have_received(:warn).with('Deprecation:', any_args)
+      end
     end
-  end
-  describe 'node_heading_parent' do
-    it 'returns the direct heading right above' do
+    it 'should get tags from page' do
       # Given
-      nodes = hierarchy_page_file.html_nodes
-      p = nodes[0]
+      input = fixture_page
       # When
-      actual = hierarchy_page_file.node_heading_parent(p)
+      actual = input.tags
-      # Then
-      expect(actual.name).to eq 'h1'
-      expect(actual.text).to eq 'H1'
+      expect(actual).to include('tag', 'another tag')
     end
-    it 'returns the closest heading even if in a sub tag' do
+    it 'should get tags from post' do
       # Given
-      nodes = hierarchy_page_file.html_nodes
-      p = nodes[2]
+      input = fixture_post
       # When
-      actual = hierarchy_page_file.node_heading_parent(p)
+      actual = input.tags
-      # Then
-      expect(actual.name).to eq 'h2'
-      expect(actual.text).to eq 'H2A'
+      expect(actual).to include('tag', 'another tag')
     end
-    it 'should automatically go up one level when indexing headings' do
+    it 'should get tags from document' do
       # Given
-      site = get_site(algolia: { 'record_css_selector' => 'p,h2' })
-      hierarchy_page_file = extractor.new(site.file_by_name('hierarchy.md'))
-      nodes = hierarchy_page_file.html_nodes
-      h2 = nodes[4]
+      input = fixture_document
       # When
-      actual = hierarchy_page_file.node_heading_parent(h2)
+      actual = input.tags
-      # Then
-      expect(actual.name).to eq 'h1'
-      expect(actual.text).to eq 'H1'
+      expect(actual).to include('tag', 'another tag')
     end
-    it 'should find the correct parent when indexing deep headings' do
+    it 'should handle custom extended tags' do
       # Given
-      site = get_site(algolia: { 'record_css_selector' => 'h2' })
-      hierarchy_page_file = extractor.new(site.file_by_name('hierarchy.md'))
-      nodes = hierarchy_page_file.html_nodes
-      h2 = nodes[2]
+      extended_tags = [
+        double('Extended Tag', to_s: 'extended tag'),
+        double('Extended Tag', to_s: 'extended another tag')
+      ]
+      input = fixture_post
+      # Overwrite string tags with more advanced ones
+      if restrict_jekyll_version(less_than: '3.0')
+        allow(input.file).to receive(:tags) { extended_tags }
+      else
+        input.file.data['tags'] = extended_tags
+      end
       # When
-      actual = hierarchy_page_file.node_heading_parent(h2)
+      actual = input.tags
-      # Then
-      expect(actual.name).to eq 'h1'
-      expect(actual.text).to eq 'H1'
+      expect(actual).to include('extended tag', 'extended another tag')
     end
   end
-  describe 'node_hierarchy' do
-    it 'returns the unique parent of a simple element' do
-      # Note: First <p> should only have a h1 as hierarchy
+  describe 'date' do
+    it 'should get the date as a timestamp for posts' do
       # Given
-      nodes = hierarchy_page_file.html_nodes
-      p = nodes[0]
+      input = fixture_post
       # When
-      actual = hierarchy_page_file.node_hierarchy(p)
+      actual = input.date
       # Then
-      expect(actual).to include(h1: 'H1')
+      expect(actual).to eq 1_435_788_000
     end
-    it 'returns the heading hierarchy of multiple headings' do
-      # Note: 5th <p> is inside h3, second h2 and main h1
+    it 'should be nil for pages' do
       # Given
-      nodes = hierarchy_page_file.html_nodes
-      p = nodes[4]
+      input = fixture_page
       # When
-      actual = hierarchy_page_file.node_hierarchy(p)
+      actual = input.date
       # Then
-      expect(actual).to include(h1: 'H1', h2: 'H2B', h3: 'H3A')
+      expect(actual).to eq nil
     end
-    it 'works even if heading not on the same level' do
-      # Note: The 6th <p> is inside a div
+    it 'should generate the timestamp relative to the configured timezone' do
       # Given
-      nodes = hierarchy_page_file.html_nodes
-      p = nodes[5]
+      site = get_site(timezone: 'America/New_York')
+      input = extractor.new(site.file_by_name('test-post.md'))
       # When
-      actual = hierarchy_page_file.node_hierarchy(p)
+      actual = input.date
       # Then
-      expect(actual).to include(h1: 'H1', h2: 'H2B', h3: 'H3A', h4: 'H4')
+      expect(actual).to eq 1_435_809_600
     end
+  end
-    it 'includes node in the output if headings are indexed' do
+  describe 'collection' do
+    it 'should get the collection name for documents' do
       # Given
-      site = get_site(algolia: { 'record_css_selector' => 'h1' })
-      hierarchy_page_file = extractor.new(site.file_by_name('hierarchy.md'))
-      nodes = hierarchy_page_file.html_nodes
-      h1 = nodes[0]
+      input = fixture_document
       # When
-      actual = hierarchy_page_file.node_hierarchy(h1)
+      actual = input.collection
       # Then
-      expect(actual).to include(h1: 'H1')
+      expect(actual).to eq 'my-collection'
     end
-    it 'escape html in headings' do
+    it 'should be nil for pages' do
       # Given
-      nodes = hierarchy_page_file.html_nodes
-      p = nodes[7]
+      input = fixture_page
       # When
-      actual = hierarchy_page_file.node_hierarchy(p)
+      actual = input.collection
       # Then
-      expect(actual).to include(h3: 'H3B &lt;code&gt;')
+      expect(actual).to eq nil
     end
-  end
-  describe 'node_raw_html' do
-    it 'returns html including surrounding tags' do
-      # Note: 3rd <p> is a real HTML with a custom class
+    it 'should be nil for posts' do
       # Given
-      nodes = page_file.html_nodes
-      p = nodes[3]
+      input = fixture_post
       # When
-      actual = page_file.node_raw_html(p)
+      actual = input.collection
       # Then
-      expect(actual).to eq '<p id="text4">Another text 4</p>'
+      expect(actual).to eq nil
     end
   end
-  describe 'node_text' do
-    it 'returns inner text with <> escaped' do
-      # Note: 4th <p> contains a <code> tag with <>
+  describe 'front_matter' do
+    it 'should get a hash of all front matter data' do
       # Given
-      nodes = page_file.html_nodes
-      p = nodes[4]
+      input = fixture_front_matter
       # When
-      actual = page_file.node_text(p)
+      actual = input.front_matter
       # Then
-      expect(actual).to eq 'Another &lt;text&gt; 5'
+      expect(actual[:author]).to eq 'John Doe'
+      expect(actual[:custom]).to eq 'foo'
     end
-  end
-  describe 'unique_hierarchy' do
-    it 'combines title and headings' do
+    it 'should remove known keys from the front-matter' do
       # Given
-      hierarchy = {
-        title: 'title',
-        h1: 'h1',
-        h2: 'h2',
-        h3: 'h3',
-        h4: 'h4',
-        h5: 'h5',
-        h6: 'h6'
-      }
+      input = fixture_front_matter
       # When
-      actual = page_file.unique_hierarchy(hierarchy)
+      actual = input.front_matter
       # Then
-      expect(actual).to eq 'title > h1 > h2 > h3 > h4 > h5 > h6'
+      expect(actual[:title]).to eq nil
+      expect(actual[:tags]).to eq nil
+      expect(actual[:slug]).to eq nil
+      expect(actual[:url]).to eq nil
+      expect(actual[:date]).to eq nil
+      expect(actual[:type]).to eq nil
     end
-    it 'combines title and headings even with missing elements' do
+    it 'should cast keys as symbols' do
       # Given
-      hierarchy = {
-        title: 'title',
-        h2: 'h2',
-        h4: 'h4',
-        h6: 'h6'
-      }
+      input = fixture_front_matter
       # When
-      actual = page_file.unique_hierarchy(hierarchy)
+      actual = input.front_matter
       # Then
-      expect(actual).to eq 'title > h2 > h4 > h6'
+      expect(actual['custom']).to eq nil
+      expect(actual[:custom]).to_not eq nil
+      expect(actual['author']).to eq nil
+      expect(actual[:author]).to_not eq nil
     end
   end
-  describe 'node_css_selector' do
-    it 'uses the #id to make the selector more precise if one is found' do
+  describe 'extract' do
+    it 'should get one item per node' do
       # Given
-      nodes = page_file.html_nodes
-      p = nodes[3]
+      input = fixture_only_paragraphs
       # When
-      actual = page_file.node_css_selector(p)
+      actual = input.extract
       # Then
-      expect(actual).to eq '#text4'
+      expect(actual.size).to eq 6
     end
-    it 'uses p:nth-of-type if no #id found' do
+    it 'should get a complete record' do
       # Given
-      nodes = page_file.html_nodes
-      p = nodes[2]
+      input = fixture_page
       # When
-      actual = page_file.node_css_selector(p)
+      actual = input.extract
       # Then
-      expect(actual).to eq 'p:nth-of-type(3)'
+      # Jekyll auto-generates anchors on heading
+      expect(actual[0][:anchor]).to eq 'heading-1'
+      # It's a page, so no date
+      expect(actual[0][:date]).to eq nil
+      # Hierarchy on first level
+      expect(actual[0][:hierarchy][:lvl0]).to eq 'Heading 1'
+      expect(actual[0][:hierarchy][:lvl1]).to eq nil
+      # Node content
+      expect(actual[0][:tag_name]).to eq 'p'
+      expect(actual[0][:html]).to eq '<p>Text 1</p>'
+      expect(actual[0][:text]).to eq 'Text 1'
+      # Page
+      expect(actual[0][:title]).to eq 'About page'
+      expect(actual[0][:slug]).to eq 'about'
+      expect(actual[0][:url]).to eq '/about.html'
+      # Tags
+      expect(actual[0][:tags]).to eq ['tag', 'another tag']
+      # Weight
+      expect(actual[0][:weight][:heading]).to eq 90
+      expect(actual[0][:weight][:position]).to eq 0
     end
-    it 'handles custom <div> markup' do
+    it 'should allow overriding the node selector' do
       # Given
-      nodes = page_file.html_nodes
-      p = nodes[5]
+      site = get_site(algolia: { 'record_css_selector' => 'div' })
+      input = extractor.new(site.file_by_name('only-divs.md'))
       # When
-      actual = page_file.node_css_selector(p)
+      actual = input.extract
       # Then
-      expect(actual).to eq 'div:nth-of-type(2) > p'
+      expect(actual.size).to eq 6
     end
-  end
-  describe 'weight_heading_relevance' do
-    it 'gets the number of words in text also in the title' do
+    it 'should contain all the basic top level info' do
       # Given
-      data = {
-        title: 'foo bar',
-        text: 'Lorem ipsum dolor foo bar, consectetur adipiscing elit'
-      }
+      input = fixture_page
+      allow(input).to receive(:date) { 'mock_date' }
+      allow(input).to receive(:slug) { 'mock_slug' }
+      allow(input).to receive(:tags) { 'mock_tags' }
+      allow(input).to receive(:title) { 'mock_title' }
+      allow(input).to receive(:url) { 'mock_url' }
+      allow(input).to receive(:type) { 'mock_type' }
       # When
-      actual = page_file.weight_heading_relevance(data)
+      actual = input.extract
       # Then
-      expect(actual).to eq 2
+      expect(actual[0][:date]).to eq 'mock_date'
+      expect(actual[0][:slug]).to eq 'mock_slug'
+      expect(actual[0][:tags]).to eq 'mock_tags'
+      expect(actual[0][:title]).to eq 'mock_title'
+      expect(actual[0][:url]).to eq 'mock_url'
+      expect(actual[0][:type]).to eq 'mock_type'
     end
-    it 'gets the number of words in text also in the headings' do
+    it 'should add node data from extractor' do
       # Given
-      data = {
-        title: 'foo',
-        h1: 'bar',
-        h2: 'baz',
-        text: 'Lorem baz dolor foo bar, consectetur adipiscing elit'
-      }
+      input = fixture_page
+      allow(input).to receive(:hierarchy_nodes) do
+        [
+          { name: 'foo' },
+          { name: 'bar' }
+        ]
+      end
       # When
-      actual = page_file.weight_heading_relevance(data)
+      actual = input.extract
       # Then
-      expect(actual).to eq 3
+      expect(actual[0][:name]).to eq 'foo'
     end
-    it 'count each word only once' do
+    it 'should not expose the HTML node' do
       # Given
-      data = {
-        title: 'foo',
-        h1: 'foo foo foo',
-        h2: 'bar bar foo bar',
-        text: 'foo bar bar bar bar baz foo bar baz'
-      }
+      input = fixture_only_paragraphs
       # When
-      actual = page_file.weight_heading_relevance(data)
+      actual = input.extract
       # Then
-      expect(actual).to eq 2
+      expect(actual[0][:node]).to eq nil
     end
-    it 'is case-insensitive' do
+    it 'should set the objectID as a hash' do
       # Given
-      data = {
-        title: 'FOO',
-        h1: 'bar Bar BAR',
-        text: 'foo BAR'
-      }
+      input = fixture_page
       # When
-      actual = page_file.weight_heading_relevance(data)
+      actual = input.extract
       # Then
-      expect(actual).to eq 2
+      expect(actual[0]).not_to have_key(:uuid)
+      expect(actual[0]).to have_key(:objectID)
     end
-    it 'should only use words, no partial matches' do
+    it 'should not contain a collection key for pages' do
       # Given
-      data = {
-        title: 'foo bar',
-        text: 'xxxfooxxx bar'
-      }
+      input = fixture_page
       # When
-      actual = page_file.weight_heading_relevance(data)
+      actual = input.extract
       # Then
-      expect(actual).to eq 1
+      expect(actual[0]).not_to have_key(:collection)
     end
-    it 'should still work with non-string keys' do
+    it 'should not contain a collection key for posts' do
       # Given
-      data = {
-        title: nil,
-        h1: [],
-        h2: {},
-        h3: true,
-        h4: false,
-        h5: 'foo bar',
-        text: 'foo bar'
-      }
+      input = fixture_post
       # When
-      actual = page_file.weight_heading_relevance(data)
+      actual = input.extract
       # Then
-      expect(actual).to eq 2
+      expect(actual[0]).not_to have_key(:collection)
     end
-  end
-  describe 'weight_tag_name' do
-    it 'gives a score of 0 to non-headings' do
+    it 'should contain the collection name for documents' do
       # Given
-      data = {
-        tag_name: 'p'
-      }
+      page = fixture_document
       # When
-      actual = page_file.weight_tag_name(data)
+      page_data = page.extract
       # Then
-      expect(actual).to eq 0
+      expect(page_data[0][:collection]).to eq 'my-collection'
     end
-    it 'gives a score of 100 to h1' do
+    it 'should not contain a date key for pages' do
       # Given
-      data = {
-        tag_name: 'h1'
-      }
+      input = fixture_page
       # When
-      actual = page_file.weight_tag_name(data)
+      actual = input.extract
       # Then
-      expect(actual).to eq 100
+      expect(actual[0]).not_to have_key(:date)
     end
-    it 'gives a score of 40 to h6' do
+  end
+  describe 'custom_hook_each' do
+    it 'should be called on every item' do
       # Given
-      data = {
-        tag_name: 'h6'
-      }
+      input = fixture_page
+      allow(input).to receive(:custom_hook_each).and_call_original
       # When
-      actual = page_file.weight_tag_name(data)
+      actual = input.extract
       # Then
-      expect(actual).to eq 50
+      expect(input).to have_received(:custom_hook_each)
+        .exactly(actual.size).times
     end
-  end
-  describe 'weight' do
-    it 'returns an object with all weights' do
+    it 'should let users change the item' do
       # Given
-      item = {
-        tag_name: 'p'
-      }
-      allow(page_file).to receive(:weight_tag_name) { 10 }
-      allow(page_file).to receive(:weight_heading_relevance) { 20 }
+      input = fixture_page
+      def input.custom_hook_each(item, _)
+        item['foo'] = 'bar'
+        item
+      end
       # When
-      actual = page_file.weight(item, 42)
+      actual = input.extract
       # Then
-      expect(actual).to include(tag_name: 10)
-      expect(actual).to include(heading_relevance: 20)
-      expect(actual).to include(position: 42)
+      expect(actual[0]['foo']).to eq 'bar'
     end
-  end
-  describe 'custom_hook_each' do
-    it 'let the user call a custom hook to modify a record' do
+    it 'should let a user remove an item by returning nil' do
       # Given
-      def page_file.custom_hook_each(item, _)
-        item[:custom_attribute] = 'foo'
-        item
+      input = fixture_page
+      def input.custom_hook_each(_, _)
+        nil
       end
       # When
-      actual = page_file.extract
+      actual = input.extract
       # Then
-      expect(actual[0]).to include(custom_attribute: 'foo')
+      expect(actual.size).to eq 0
     end
-    it 'let the user discard a record by returning nil' do
+    it 'should be passed the Nokogiri node as second argument' do
       # Given
-      def page_file.custom_hook_each(_, _)
-        nil
+      input = fixture_page
+      def input.custom_hook_each(item, nokogiri_node)
+        item['foo'] = nokogiri_node
+        item
       end
       # When
-      actual = page_file.extract
+      actual = input.extract
       # Then
-      expect(actual.size).to eq 0
+      expect(actual[0]['foo']).to be_an(Nokogiri::XML::Element)
     end
   end
   describe 'custom_hook_all' do
-    it 'let the user call a custom hook to modify the list of records' do
+    it 'should let the user update the list of records' do
       # Given
-      def page_file.custom_hook_all(items)
-        [items[0], { foo: 'bar' }]
+      input = fixture_page
+      def input.custom_hook_all(_)
+        [{
+          'foo' => 'bar'
+        }]
       end
       # When
-      actual = page_file.extract
+      actual = input.extract
       # Then
-      expect(actual.size).to eq 2
-      expect(actual[1]).to include(foo: 'bar')
+      expect(actual[0]['foo']).to eq 'bar'
     end
   end
 end