algoliasearch-jekyll 0.9.1 → 1.0.0.beta.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +3 -4
  3. data/CONTRIBUTING.md +8 -1
  4. data/Gemfile +4 -5
  5. data/README.md +318 -11
  6. data/Rakefile +7 -12
  7. data/algoliasearch-jekyll.gemspec +66 -62
  8. data/gemfiles/jekyll_v2.gemfile +3 -3
  9. data/gemfiles/jekyll_v3.gemfile +4 -4
  10. data/gemfiles/jekyll_v3_1_3.gemfile +24 -0
  11. data/gemfiles/jekyll_v3_1_6.gemfile +24 -0
  12. data/lib/algoliasearch-jekyll.rb +1 -3
  13. data/lib/credential_checker.rb +2 -1
  14. data/lib/error_handler.rb +6 -0
  15. data/lib/push.rb +81 -19
  16. data/lib/record_extractor.rb +120 -140
  17. data/lib/utils.rb +13 -0
  18. data/lib/version.rb +1 -1
  19. data/scripts/release +13 -12
  20. data/scripts/test_v3 +1 -1
  21. data/scripts/watch +4 -0
  22. data/spec/error_handler_spec.rb +17 -0
  23. data/spec/fixtures/jekyll_version_2/404.html +8 -0
  24. data/spec/fixtures/jekyll_version_2/404.md +9 -0
  25. data/spec/fixtures/jekyll_version_2/_my-collection/collection-item.md +3 -0
  26. data/spec/fixtures/jekyll_version_2/_posts/2015-07-02-test-post.md +1 -1
  27. data/spec/fixtures/jekyll_version_2/about.md +3 -0
  28. data/spec/fixtures/jekyll_version_2/front_matter.md +15 -0
  29. data/spec/fixtures/jekyll_version_2/index.html +3 -1
  30. data/spec/fixtures/jekyll_version_2/only-divs.md +15 -0
  31. data/spec/fixtures/jekyll_version_2/only-paragraphs.md +15 -0
  32. data/spec/fixtures/jekyll_version_3/404.html +8 -0
  33. data/spec/fixtures/jekyll_version_3/404.md +9 -0
  34. data/spec/fixtures/jekyll_version_3/_config.yml +1 -1
  35. data/spec/fixtures/jekyll_version_3/_my-collection/collection-item.md +3 -0
  36. data/spec/fixtures/jekyll_version_3/_posts/2015-07-02-test-post.md +1 -1
  37. data/spec/fixtures/jekyll_version_3/about.md +3 -0
  38. data/spec/fixtures/jekyll_version_3/front_matter.md +15 -0
  39. data/spec/fixtures/jekyll_version_3/index.html +4 -1
  40. data/spec/fixtures/jekyll_version_3/only-divs.md +15 -0
  41. data/spec/fixtures/jekyll_version_3/only-paragraphs.md +15 -0
  42. data/spec/push_spec.rb +211 -8
  43. data/spec/record_extractor_spec.rb +296 -358
  44. data/spec/spec_helper.rb +32 -11
  45. data/txt/record_too_big +19 -0
  46. metadata +40 -51
  47. data/scripts/watch +0 -1
@@ -3,643 +3,581 @@ require 'spec_helper'
3
3
  describe(AlgoliaSearchRecordExtractor) do
4
4
  let(:extractor) { AlgoliaSearchRecordExtractor }
5
5
  let(:site) { get_site }
6
- let(:page_file) { extractor.new(site.file_by_name('about.md')) }
7
- let(:html_page_file) { extractor.new(site.file_by_name('authors.html')) }
8
- let(:post_file) { extractor.new(site.file_by_name('test-post.md')) }
9
- let(:hierarchy_page_file) { extractor.new(site.file_by_name('hierarchy.md')) }
10
- let(:weight_page_file) { extractor.new(site.file_by_name('weight.md')) }
11
- let(:document_file) { extractor.new(site.file_by_name('collection-item.md')) }
6
+ let(:fixture_page) { extractor.new(site.file_by_name('about.md')) }
7
+ let(:fixture_post) { extractor.new(site.file_by_name('test-post.md')) }
8
+ let(:fixture_document) do
9
+ extractor.new(site.file_by_name('collection-item.md'))
10
+ end
11
+ let(:fixture_only_paragraphs) do
12
+ extractor.new(site.file_by_name('only-paragraphs.md'))
13
+ end
14
+ let(:fixture_front_matter) do
15
+ extractor.new(site.file_by_name('front_matter.md'))
16
+ end
12
17
 
13
18
  before(:each) do
14
- # Disabling the logs, while still allowing to spy them
15
- Jekyll.logger = double('Specific Mock Logger').as_null_object
16
- @logger = Jekyll.logger.writer
19
+ mock_logger
17
20
  end
18
21
 
19
- describe 'metadata' do
20
- it 'gets metadata from page' do
22
+ describe 'type' do
23
+ it 'should recognize a page' do
21
24
  # Given
22
- actual = page_file.metadata
23
-
24
- # Then
25
- expect(actual[:type]).to eq 'page'
26
- expect(actual[:slug]).to eq 'about'
27
- expect(actual[:title]).to eq 'About page'
28
- expect(actual[:url]).to eq '/about.html'
29
- expect(actual[:custom]).to eq 'Foo'
30
- end
25
+ input = fixture_page
31
26
 
32
- it 'gets metadata from post' do
33
- # Given
34
- actual = post_file.metadata
27
+ # When
28
+ actual = input.type
35
29
 
36
- # Then
37
- expect(actual[:slug]).to eq 'test-post'
38
- expect(actual[:title]).to eq 'Test post'
39
- expect(actual[:url]).to eq '/2015/07/02/test-post.html'
40
- expect(actual[:posted_at]).to eq 1_435_788_000
41
- expect(actual[:custom]).to eq 'Foo'
30
+ expect(actual).to eq 'page'
42
31
  end
43
32
 
44
- it 'gets posted_at timestamp based on the configured timezone' do
33
+ it 'should recognize a post' do
45
34
  # Given
46
- site = get_site(timezone: 'America/New_York')
47
- post_file = extractor.new(site.file_by_name('test-post.md'))
48
- actual = post_file.metadata
49
-
50
- # Then
51
- expect(actual[:posted_at]).to eq 1_435_809_600
52
- end
35
+ input = fixture_post
53
36
 
54
- it 'gets metadata from document' do
55
- # Given
56
- actual = document_file.metadata
37
+ # When
38
+ actual = input.type
57
39
 
58
- # Then
59
- expect(actual[:type]).to eq 'document'
60
- expect(actual[:slug]).to eq 'collection-item'
61
- expect(actual[:title]).to eq 'Collection Item'
62
- expect(actual[:url]).to eq '/my-collection/collection-item.html'
63
- expect(actual[:custom]).to eq 'Foo'
40
+ expect(actual).to eq 'post'
64
41
  end
65
42
 
66
- if restrict_jekyll_version(more_than: '3.0')
67
- describe 'Jekyll > 3.0' do
68
- it 'should not throw any deprecation warnings' do
69
- # Given
70
-
71
- # When
72
- post_file.metadata
43
+ it 'should recognize a document' do
44
+ # Given
45
+ input = fixture_document
73
46
 
74
- # Expect
75
- expect(@logger).to_not have_received(:warn)
76
- end
77
- end
47
+ # When
48
+ actual = input.type
78
49
 
50
+ expect(actual).to eq 'document'
79
51
  end
80
52
  end
81
53
 
82
- describe 'slug' do
83
- it 'gets it from data if available' do
54
+ describe 'url' do
55
+ it 'should use the page url' do
84
56
  # Given
85
- post_file.file.data['slug'] = 'foo'
86
- allow(post_file.file).to receive(:respond_to?).with(:slug) do
87
- false
88
- end
57
+ input = fixture_page
89
58
 
90
59
  # When
91
- actual = post_file.slug
60
+ actual = input.url
92
61
 
93
- # Then
94
- expect(actual).to eql('foo')
62
+ expect(actual).to eq '/about.html'
95
63
  end
96
64
 
97
- it 'gets it from the root if not in data' do
65
+ it 'should use the post url' do
98
66
  # Given
99
- post_file.file.data.delete 'slug'
100
- allow(post_file.file).to receive(:slug).and_return('foo')
67
+ input = fixture_post
101
68
 
102
69
  # When
103
- actual = post_file.slug
70
+ actual = input.url
104
71
 
105
- # Then
106
- expect(actual).to eql('foo')
72
+ expect(actual).to eq '/2015/07/02/test-post.html'
107
73
  end
108
74
 
109
- it 'gets it from the data even if in the root' do
75
+ it 'should use the document url' do
110
76
  # Given
111
- post_file.file.data['slug'] = 'foo'
112
- allow(post_file.file).to receive(:slug).and_return('bar')
77
+ input = fixture_document
113
78
 
114
79
  # When
115
- actual = post_file.slug
80
+ actual = input.url
116
81
 
117
- # Then
118
- expect(actual).to eql('foo')
82
+ expect(actual).to eq '/my-collection/collection-item.html'
119
83
  end
84
+ end
120
85
 
121
- it 'guesses it from the path if not found' do
86
+ describe 'title' do
87
+ it 'should use the page title' do
122
88
  # Given
123
- post_file.file.data.delete 'slug'
124
- allow(post_file.file).to receive(:respond_to?).with(:slug) do
125
- false
126
- end
127
- allow(post_file.file).to receive(:path) do
128
- '/path/to/file/foo.html'
129
- end
89
+ input = fixture_page
130
90
 
131
91
  # When
132
- actual = post_file.slug
92
+ actual = input.title
133
93
 
134
- # # Then
135
- expect(actual).to eql('foo')
94
+ expect(actual).to eq 'About page'
136
95
  end
137
- end
138
96
 
139
- describe 'tags' do
140
- it 'returns tags in data if available' do
97
+ it 'should use the post title' do
141
98
  # Given
142
- post_file.file.data['tags'] = %w(foo bar)
143
- allow(post_file.file).to receive(:respond_to?).with(:tags) do
144
- false
145
- end
99
+ input = fixture_post
146
100
 
147
101
  # When
148
- actual = post_file.tags
102
+ actual = input.title
149
103
 
150
- # Then
151
- expect(actual).to include('foo', 'bar')
104
+ expect(actual).to eq 'Test post'
152
105
  end
153
106
 
154
- it 'returns tags at the root if not in data' do
107
+ it 'should use the document title' do
155
108
  # Given
156
- post_file.file.data.delete 'tags'
157
- allow(post_file.file).to receive(:tags).and_return(%w(foo bar))
109
+ input = fixture_document
158
110
 
159
111
  # When
160
- actual = post_file.tags
112
+ actual = input.title
161
113
 
162
- # Then
163
- expect(actual).to include('foo', 'bar')
114
+ expect(actual).to eq 'Collection Item'
115
+ end
116
+ end
117
+
118
+ describe 'slug' do
119
+ if restrict_jekyll_version(more_than: '3.0')
120
+ it 'should not throw a deprecation warning' do
121
+ # Given
122
+ input = fixture_post
123
+
124
+ # When
125
+ input.slug
126
+
127
+ # Then
128
+ expect(Jekyll.logger)
129
+ .to_not have_received(:warn).with('Deprecation:', any_args)
130
+ end
164
131
  end
165
132
 
166
- it 'returns tags in data even if in root' do
133
+ it 'should get it for a page' do
167
134
  # Given
168
- post_file.file.data['tags'] = %w(foo bar)
169
- allow(post_file.file).to receive(:tags).and_return(%w(js css))
135
+ input = fixture_page
170
136
 
171
137
  # When
172
- actual = post_file.tags
138
+ actual = input.slug
173
139
 
174
- # Then
175
- expect(actual).to include('foo', 'bar')
140
+ expect(actual).to eq 'about'
176
141
  end
177
142
 
178
- it 'parses tags as string if they are another type' do
143
+ it 'should get it for a post' do
179
144
  # Given
180
- tag_foo = double('Extended Tag', to_s: 'foo')
181
- tag_bar = double('Extended Tag', to_s: 'bar')
182
- post_file.file.data['tags'] = [tag_foo, tag_bar]
183
- allow(post_file.file).to receive(:respond_to?).with(:tags) do
184
- false
185
- end
145
+ input = fixture_post
186
146
 
187
147
  # When
188
- actual = post_file.tags
148
+ actual = input.slug
189
149
 
190
- # Then
191
- expect(actual).to include('foo', 'bar')
150
+ expect(actual).to eq 'test-post'
192
151
  end
193
152
 
194
- it 'extract tags from front matter' do
153
+ it 'should get it for a document' do
195
154
  # Given
196
- actual = post_file.tags
155
+ input = fixture_document
197
156
 
198
- # Then
199
- expect(actual).to include('tag', 'another tag')
157
+ # When
158
+ actual = input.slug
159
+
160
+ expect(actual).to eq 'collection-item'
200
161
  end
201
162
  end
202
163
 
203
- describe 'html_nodes' do
204
- it 'returns the list of all <p> by default' do
205
- expect(page_file.html_nodes.size).to eq 6
206
- end
164
+ describe 'tags' do
165
+ if restrict_jekyll_version(more_than: '3.0')
166
+ it 'should not throw a deprecation warning' do
167
+ # Given
168
+ input = fixture_post
207
169
 
208
- it 'allow _config.yml to override the selector' do
209
- # Given
210
- site = get_site(algolia: { 'record_css_selector' => 'p,ul' })
211
- page_file = extractor.new(site.file_by_name('about.md'))
170
+ # When
171
+ input.tags
212
172
 
213
- expect(page_file.html_nodes.size).to eq 7
173
+ # Then
174
+ expect(Jekyll.logger)
175
+ .to_not have_received(:warn).with('Deprecation:', any_args)
176
+ end
214
177
  end
215
- end
216
178
 
217
- describe 'node_heading_parent' do
218
- it 'returns the direct heading right above' do
179
+ it 'should get tags from page' do
219
180
  # Given
220
- nodes = hierarchy_page_file.html_nodes
221
- p = nodes[0]
181
+ input = fixture_page
222
182
 
223
183
  # When
224
- actual = hierarchy_page_file.node_heading_parent(p)
184
+ actual = input.tags
225
185
 
226
- # Then
227
- expect(actual.name).to eq 'h1'
228
- expect(actual.text).to eq 'H1'
186
+ expect(actual).to include('tag', 'another tag')
229
187
  end
230
188
 
231
- it 'returns the closest heading even if in a sub tag' do
189
+ it 'should get tags from post' do
232
190
  # Given
233
- nodes = hierarchy_page_file.html_nodes
234
- p = nodes[2]
191
+ input = fixture_post
235
192
 
236
193
  # When
237
- actual = hierarchy_page_file.node_heading_parent(p)
194
+ actual = input.tags
238
195
 
239
- # Then
240
- expect(actual.name).to eq 'h2'
241
- expect(actual.text).to eq 'H2A'
196
+ expect(actual).to include('tag', 'another tag')
242
197
  end
243
198
 
244
- it 'should automatically go up one level when indexing headings' do
199
+ it 'should get tags from document' do
245
200
  # Given
246
- site = get_site(algolia: { 'record_css_selector' => 'p,h2' })
247
- hierarchy_page_file = extractor.new(site.file_by_name('hierarchy.md'))
248
- nodes = hierarchy_page_file.html_nodes
249
- h2 = nodes[4]
201
+ input = fixture_document
250
202
 
251
203
  # When
252
- actual = hierarchy_page_file.node_heading_parent(h2)
204
+ actual = input.tags
253
205
 
254
- # Then
255
- expect(actual.name).to eq 'h1'
256
- expect(actual.text).to eq 'H1'
206
+ expect(actual).to include('tag', 'another tag')
257
207
  end
258
208
 
259
- it 'should find the correct parent when indexing deep headings' do
209
+ it 'should handle custom extended tags' do
260
210
  # Given
261
- site = get_site(algolia: { 'record_css_selector' => 'h2' })
262
- hierarchy_page_file = extractor.new(site.file_by_name('hierarchy.md'))
263
- nodes = hierarchy_page_file.html_nodes
264
- h2 = nodes[2]
211
+ extended_tags = [
212
+ double('Extended Tag', to_s: 'extended tag'),
213
+ double('Extended Tag', to_s: 'extended another tag')
214
+ ]
215
+ input = fixture_post
216
+
217
+ # Overwrite string tags with more advanced ones
218
+ if restrict_jekyll_version(less_than: '3.0')
219
+ allow(input.file).to receive(:tags) { extended_tags }
220
+ else
221
+ input.file.data['tags'] = extended_tags
222
+ end
265
223
 
266
224
  # When
267
- actual = hierarchy_page_file.node_heading_parent(h2)
225
+ actual = input.tags
268
226
 
269
- # Then
270
- expect(actual.name).to eq 'h1'
271
- expect(actual.text).to eq 'H1'
227
+ expect(actual).to include('extended tag', 'extended another tag')
272
228
  end
273
229
  end
274
230
 
275
- describe 'node_hierarchy' do
276
- it 'returns the unique parent of a simple element' do
277
- # Note: First <p> should only have a h1 as hierarchy
231
+ describe 'date' do
232
+ it 'should get the date as a timestamp for posts' do
278
233
  # Given
279
- nodes = hierarchy_page_file.html_nodes
280
- p = nodes[0]
234
+ input = fixture_post
281
235
 
282
236
  # When
283
- actual = hierarchy_page_file.node_hierarchy(p)
237
+ actual = input.date
284
238
 
285
239
  # Then
286
- expect(actual).to include(h1: 'H1')
240
+ expect(actual).to eq 1_435_788_000
287
241
  end
288
242
 
289
- it 'returns the heading hierarchy of multiple headings' do
290
- # Note: 5th <p> is inside h3, second h2 and main h1
243
+ it 'should be nil for pages' do
291
244
  # Given
292
- nodes = hierarchy_page_file.html_nodes
293
- p = nodes[4]
245
+ input = fixture_page
294
246
 
295
247
  # When
296
- actual = hierarchy_page_file.node_hierarchy(p)
248
+ actual = input.date
297
249
 
298
250
  # Then
299
- expect(actual).to include(h1: 'H1', h2: 'H2B', h3: 'H3A')
251
+ expect(actual).to eq nil
300
252
  end
301
253
 
302
- it 'works even if heading not on the same level' do
303
- # Note: The 6th <p> is inside a div
254
+ it 'should generate the timestamp relative to the configured timezone' do
304
255
  # Given
305
- nodes = hierarchy_page_file.html_nodes
306
- p = nodes[5]
256
+ site = get_site(timezone: 'America/New_York')
257
+ input = extractor.new(site.file_by_name('test-post.md'))
307
258
 
308
259
  # When
309
- actual = hierarchy_page_file.node_hierarchy(p)
260
+ actual = input.date
310
261
 
311
262
  # Then
312
- expect(actual).to include(h1: 'H1', h2: 'H2B', h3: 'H3A', h4: 'H4')
263
+ expect(actual).to eq 1_435_809_600
313
264
  end
265
+ end
314
266
 
315
- it 'includes node in the output if headings are indexed' do
267
+ describe 'collection' do
268
+ it 'should get the collection name for documents' do
316
269
  # Given
317
- site = get_site(algolia: { 'record_css_selector' => 'h1' })
318
- hierarchy_page_file = extractor.new(site.file_by_name('hierarchy.md'))
319
- nodes = hierarchy_page_file.html_nodes
320
- h1 = nodes[0]
270
+ input = fixture_document
321
271
 
322
272
  # When
323
- actual = hierarchy_page_file.node_hierarchy(h1)
273
+ actual = input.collection
324
274
 
325
275
  # Then
326
- expect(actual).to include(h1: 'H1')
276
+ expect(actual).to eq 'my-collection'
327
277
  end
328
278
 
329
- it 'escape html in headings' do
279
+ it 'should be nil for pages' do
330
280
  # Given
331
- nodes = hierarchy_page_file.html_nodes
332
- p = nodes[7]
281
+ input = fixture_page
333
282
 
334
283
  # When
335
- actual = hierarchy_page_file.node_hierarchy(p)
284
+ actual = input.collection
336
285
 
337
286
  # Then
338
- expect(actual).to include(h3: 'H3B &lt;code&gt;')
287
+ expect(actual).to eq nil
339
288
  end
340
- end
341
289
 
342
- describe 'node_raw_html' do
343
- it 'returns html including surrounding tags' do
344
- # Note: 3rd <p> is a real HTML with a custom class
290
+ it 'should be nil for posts' do
345
291
  # Given
346
- nodes = page_file.html_nodes
347
- p = nodes[3]
292
+ input = fixture_post
348
293
 
349
294
  # When
350
- actual = page_file.node_raw_html(p)
295
+ actual = input.collection
351
296
 
352
297
  # Then
353
- expect(actual).to eq '<p id="text4">Another text 4</p>'
298
+ expect(actual).to eq nil
354
299
  end
355
300
  end
356
301
 
357
- describe 'node_text' do
358
- it 'returns inner text with <> escaped' do
359
- # Note: 4th <p> contains a <code> tag with <>
302
+ describe 'front_matter' do
303
+ it 'should get a hash of all front matter data' do
360
304
  # Given
361
- nodes = page_file.html_nodes
362
- p = nodes[4]
305
+ input = fixture_front_matter
363
306
 
364
307
  # When
365
- actual = page_file.node_text(p)
308
+ actual = input.front_matter
366
309
 
367
310
  # Then
368
- expect(actual).to eq 'Another &lt;text&gt; 5'
311
+ expect(actual[:author]).to eq 'John Doe'
312
+ expect(actual[:custom]).to eq 'foo'
369
313
  end
370
- end
371
314
 
372
- describe 'unique_hierarchy' do
373
- it 'combines title and headings' do
315
+ it 'should remove known keys from the front-matter' do
374
316
  # Given
375
- hierarchy = {
376
- title: 'title',
377
- h1: 'h1',
378
- h2: 'h2',
379
- h3: 'h3',
380
- h4: 'h4',
381
- h5: 'h5',
382
- h6: 'h6'
383
- }
317
+ input = fixture_front_matter
384
318
 
385
319
  # When
386
- actual = page_file.unique_hierarchy(hierarchy)
320
+ actual = input.front_matter
387
321
 
388
322
  # Then
389
- expect(actual).to eq 'title > h1 > h2 > h3 > h4 > h5 > h6'
323
+ expect(actual[:title]).to eq nil
324
+ expect(actual[:tags]).to eq nil
325
+ expect(actual[:slug]).to eq nil
326
+ expect(actual[:url]).to eq nil
327
+ expect(actual[:date]).to eq nil
328
+ expect(actual[:type]).to eq nil
390
329
  end
391
330
 
392
- it 'combines title and headings even with missing elements' do
331
+ it 'should cast keys as symbols' do
393
332
  # Given
394
- hierarchy = {
395
- title: 'title',
396
- h2: 'h2',
397
- h4: 'h4',
398
- h6: 'h6'
399
- }
333
+ input = fixture_front_matter
400
334
 
401
335
  # When
402
- actual = page_file.unique_hierarchy(hierarchy)
336
+ actual = input.front_matter
403
337
 
404
338
  # Then
405
- expect(actual).to eq 'title > h2 > h4 > h6'
339
+ expect(actual['custom']).to eq nil
340
+ expect(actual[:custom]).to_not eq nil
341
+ expect(actual['author']).to eq nil
342
+ expect(actual[:author]).to_not eq nil
406
343
  end
407
344
  end
408
345
 
409
- describe 'node_css_selector' do
410
- it 'uses the #id to make the selector more precise if one is found' do
346
+ describe 'extract' do
347
+ it 'should get one item per node' do
411
348
  # Given
412
- nodes = page_file.html_nodes
413
- p = nodes[3]
349
+ input = fixture_only_paragraphs
414
350
 
415
351
  # When
416
- actual = page_file.node_css_selector(p)
352
+ actual = input.extract
417
353
 
418
354
  # Then
419
- expect(actual).to eq '#text4'
355
+ expect(actual.size).to eq 6
420
356
  end
421
357
 
422
- it 'uses p:nth-of-type if no #id found' do
358
+ it 'should get a complete record' do
423
359
  # Given
424
- nodes = page_file.html_nodes
425
- p = nodes[2]
360
+ input = fixture_page
426
361
 
427
362
  # When
428
- actual = page_file.node_css_selector(p)
363
+ actual = input.extract
429
364
 
430
365
  # Then
431
- expect(actual).to eq 'p:nth-of-type(3)'
366
+ # Jekyll auto-generates anchors on heading
367
+ expect(actual[0][:anchor]).to eq 'heading-1'
368
+ # It's a page, so no date
369
+ expect(actual[0][:date]).to eq nil
370
+ # Hierarchy on first level
371
+ expect(actual[0][:hierarchy][:lvl0]).to eq 'Heading 1'
372
+ expect(actual[0][:hierarchy][:lvl1]).to eq nil
373
+ # Node content
374
+ expect(actual[0][:tag_name]).to eq 'p'
375
+ expect(actual[0][:html]).to eq '<p>Text 1</p>'
376
+ expect(actual[0][:text]).to eq 'Text 1'
377
+ # Page
378
+ expect(actual[0][:title]).to eq 'About page'
379
+ expect(actual[0][:slug]).to eq 'about'
380
+ expect(actual[0][:url]).to eq '/about.html'
381
+ # Tags
382
+ expect(actual[0][:tags]).to eq ['tag', 'another tag']
383
+ # Weight
384
+ expect(actual[0][:weight][:heading]).to eq 90
385
+ expect(actual[0][:weight][:position]).to eq 0
432
386
  end
433
387
 
434
- it 'handles custom <div> markup' do
388
+ it 'should allow overriding the node selector' do
435
389
  # Given
436
- nodes = page_file.html_nodes
437
- p = nodes[5]
390
+ site = get_site(algolia: { 'record_css_selector' => 'div' })
391
+ input = extractor.new(site.file_by_name('only-divs.md'))
438
392
 
439
393
  # When
440
- actual = page_file.node_css_selector(p)
394
+ actual = input.extract
441
395
 
442
396
  # Then
443
- expect(actual).to eq 'div:nth-of-type(2) > p'
397
+ expect(actual.size).to eq 6
444
398
  end
445
- end
446
399
 
447
- describe 'weight_heading_relevance' do
448
- it 'gets the number of words in text also in the title' do
400
+ it 'should contain all the basic top level info' do
449
401
  # Given
450
- data = {
451
- title: 'foo bar',
452
- text: 'Lorem ipsum dolor foo bar, consectetur adipiscing elit'
453
- }
402
+ input = fixture_page
403
+ allow(input).to receive(:date) { 'mock_date' }
404
+ allow(input).to receive(:slug) { 'mock_slug' }
405
+ allow(input).to receive(:tags) { 'mock_tags' }
406
+ allow(input).to receive(:title) { 'mock_title' }
407
+ allow(input).to receive(:url) { 'mock_url' }
408
+ allow(input).to receive(:type) { 'mock_type' }
454
409
 
455
410
  # When
456
- actual = page_file.weight_heading_relevance(data)
411
+ actual = input.extract
457
412
 
458
413
  # Then
459
- expect(actual).to eq 2
414
+ expect(actual[0][:date]).to eq 'mock_date'
415
+ expect(actual[0][:slug]).to eq 'mock_slug'
416
+ expect(actual[0][:tags]).to eq 'mock_tags'
417
+ expect(actual[0][:title]).to eq 'mock_title'
418
+ expect(actual[0][:url]).to eq 'mock_url'
419
+ expect(actual[0][:type]).to eq 'mock_type'
460
420
  end
461
421
 
462
- it 'gets the number of words in text also in the headings' do
422
+ it 'should add node data from extractor' do
463
423
  # Given
464
- data = {
465
- title: 'foo',
466
- h1: 'bar',
467
- h2: 'baz',
468
- text: 'Lorem baz dolor foo bar, consectetur adipiscing elit'
469
- }
424
+ input = fixture_page
425
+ allow(input).to receive(:hierarchy_nodes) do
426
+ [
427
+ { name: 'foo' },
428
+ { name: 'bar' }
429
+ ]
430
+ end
470
431
 
471
432
  # When
472
- actual = page_file.weight_heading_relevance(data)
433
+ actual = input.extract
473
434
 
474
435
  # Then
475
- expect(actual).to eq 3
436
+ expect(actual[0][:name]).to eq 'foo'
476
437
  end
477
438
 
478
- it 'count each word only once' do
439
+ it 'should not expose the HTML node' do
479
440
  # Given
480
- data = {
481
- title: 'foo',
482
- h1: 'foo foo foo',
483
- h2: 'bar bar foo bar',
484
- text: 'foo bar bar bar bar baz foo bar baz'
485
- }
441
+ input = fixture_only_paragraphs
486
442
 
487
443
  # When
488
- actual = page_file.weight_heading_relevance(data)
444
+ actual = input.extract
489
445
 
490
446
  # Then
491
- expect(actual).to eq 2
447
+ expect(actual[0][:node]).to eq nil
492
448
  end
493
449
 
494
- it 'is case-insensitive' do
450
+ it 'should set the objectID as a hash' do
495
451
  # Given
496
- data = {
497
- title: 'FOO',
498
- h1: 'bar Bar BAR',
499
- text: 'foo BAR'
500
- }
452
+ input = fixture_page
501
453
 
502
454
  # When
503
- actual = page_file.weight_heading_relevance(data)
455
+ actual = input.extract
504
456
 
505
457
  # Then
506
- expect(actual).to eq 2
458
+ expect(actual[0]).not_to have_key(:uuid)
459
+ expect(actual[0]).to have_key(:objectID)
507
460
  end
508
461
 
509
- it 'should only use words, no partial matches' do
462
+ it 'should not contain a collection key for pages' do
510
463
  # Given
511
- data = {
512
- title: 'foo bar',
513
- text: 'xxxfooxxx bar'
514
- }
464
+ input = fixture_page
515
465
 
516
466
  # When
517
- actual = page_file.weight_heading_relevance(data)
467
+ actual = input.extract
518
468
 
519
469
  # Then
520
- expect(actual).to eq 1
470
+ expect(actual[0]).not_to have_key(:collection)
521
471
  end
522
472
 
523
- it 'should still work with non-string keys' do
473
+ it 'should not contain a collection key for posts' do
524
474
  # Given
525
- data = {
526
- title: nil,
527
- h1: [],
528
- h2: {},
529
- h3: true,
530
- h4: false,
531
- h5: 'foo bar',
532
- text: 'foo bar'
533
- }
475
+ input = fixture_post
534
476
 
535
477
  # When
536
- actual = page_file.weight_heading_relevance(data)
478
+ actual = input.extract
537
479
 
538
480
  # Then
539
- expect(actual).to eq 2
481
+ expect(actual[0]).not_to have_key(:collection)
540
482
  end
541
- end
542
483
 
543
- describe 'weight_tag_name' do
544
- it 'gives a score of 0 to non-headings' do
484
+ it 'should contain the collection name for documents' do
545
485
  # Given
546
- data = {
547
- tag_name: 'p'
548
- }
486
+ page = fixture_document
549
487
 
550
488
  # When
551
- actual = page_file.weight_tag_name(data)
489
+ page_data = page.extract
552
490
 
553
491
  # Then
554
- expect(actual).to eq 0
492
+ expect(page_data[0][:collection]).to eq 'my-collection'
555
493
  end
556
- it 'gives a score of 100 to h1' do
494
+
495
+ it 'should not contain a date key for pages' do
557
496
  # Given
558
- data = {
559
- tag_name: 'h1'
560
- }
497
+ input = fixture_page
561
498
 
562
499
  # When
563
- actual = page_file.weight_tag_name(data)
500
+ actual = input.extract
564
501
 
565
502
  # Then
566
- expect(actual).to eq 100
503
+ expect(actual[0]).not_to have_key(:date)
567
504
  end
568
- it 'gives a score of 40 to h6' do
505
+ end
506
+
507
+ describe 'custom_hook_each' do
508
+ it 'should be called on every item' do
569
509
  # Given
570
- data = {
571
- tag_name: 'h6'
572
- }
510
+ input = fixture_page
511
+ allow(input).to receive(:custom_hook_each).and_call_original
573
512
 
574
513
  # When
575
- actual = page_file.weight_tag_name(data)
514
+ actual = input.extract
576
515
 
577
516
  # Then
578
- expect(actual).to eq 50
517
+ expect(input).to have_received(:custom_hook_each)
518
+ .exactly(actual.size).times
579
519
  end
580
- end
581
520
 
582
- describe 'weight' do
583
- it 'returns an object with all weights' do
521
+ it 'should let users change the item' do
584
522
  # Given
585
- item = {
586
- tag_name: 'p'
587
- }
588
- allow(page_file).to receive(:weight_tag_name) { 10 }
589
- allow(page_file).to receive(:weight_heading_relevance) { 20 }
523
+ input = fixture_page
524
+ def input.custom_hook_each(item, _)
525
+ item['foo'] = 'bar'
526
+ item
527
+ end
590
528
 
591
529
  # When
592
- actual = page_file.weight(item, 42)
530
+ actual = input.extract
593
531
 
594
532
  # Then
595
- expect(actual).to include(tag_name: 10)
596
- expect(actual).to include(heading_relevance: 20)
597
- expect(actual).to include(position: 42)
533
+ expect(actual[0]['foo']).to eq 'bar'
598
534
  end
599
- end
600
535
 
601
- describe 'custom_hook_each' do
602
- it 'let the user call a custom hook to modify a record' do
536
+ it 'should let a user remove an item by returning nil' do
603
537
  # Given
604
- def page_file.custom_hook_each(item, _)
605
- item[:custom_attribute] = 'foo'
606
- item
538
+ input = fixture_page
539
+ def input.custom_hook_each(_, _)
540
+ nil
607
541
  end
608
542
 
609
543
  # When
610
- actual = page_file.extract
544
+ actual = input.extract
611
545
 
612
546
  # Then
613
- expect(actual[0]).to include(custom_attribute: 'foo')
547
+ expect(actual.size).to eq 0
614
548
  end
615
549
 
616
- it 'let the user discard a record by returning nil' do
550
+ it 'should be passed the Nokogiri node as second argument' do
617
551
  # Given
618
- def page_file.custom_hook_each(_, _)
619
- nil
552
+ input = fixture_page
553
+ def input.custom_hook_each(item, nokogiri_node)
554
+ item['foo'] = nokogiri_node
555
+ item
620
556
  end
621
557
 
622
558
  # When
623
- actual = page_file.extract
559
+ actual = input.extract
624
560
 
625
561
  # Then
626
- expect(actual.size).to eq 0
562
+ expect(actual[0]['foo']).to be_an(Nokogiri::XML::Element)
627
563
  end
628
564
  end
629
565
 
630
566
  describe 'custom_hook_all' do
631
- it 'let the user call a custom hook to modify the list of records' do
567
+ it 'should let the user update the list of records' do
632
568
  # Given
633
- def page_file.custom_hook_all(items)
634
- [items[0], { foo: 'bar' }]
569
+ input = fixture_page
570
+ def input.custom_hook_all(_)
571
+ [{
572
+ 'foo' => 'bar'
573
+ }]
635
574
  end
636
575
 
637
576
  # When
638
- actual = page_file.extract
577
+ actual = input.extract
639
578
 
640
579
  # Then
641
- expect(actual.size).to eq 2
642
- expect(actual[1]).to include(foo: 'bar')
580
+ expect(actual[0]['foo']).to eq 'bar'
643
581
  end
644
582
  end
645
583
  end