algoliasearch-jekyll 0.9.1 → 1.0.0.beta.pre.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +3 -4
  3. data/CONTRIBUTING.md +8 -1
  4. data/Gemfile +4 -5
  5. data/README.md +318 -11
  6. data/Rakefile +7 -12
  7. data/algoliasearch-jekyll.gemspec +66 -62
  8. data/gemfiles/jekyll_v2.gemfile +3 -3
  9. data/gemfiles/jekyll_v3.gemfile +4 -4
  10. data/gemfiles/jekyll_v3_1_3.gemfile +24 -0
  11. data/gemfiles/jekyll_v3_1_6.gemfile +24 -0
  12. data/lib/algoliasearch-jekyll.rb +1 -3
  13. data/lib/credential_checker.rb +2 -1
  14. data/lib/error_handler.rb +6 -0
  15. data/lib/push.rb +81 -19
  16. data/lib/record_extractor.rb +120 -140
  17. data/lib/utils.rb +13 -0
  18. data/lib/version.rb +1 -1
  19. data/scripts/release +13 -12
  20. data/scripts/test_v3 +1 -1
  21. data/scripts/watch +4 -0
  22. data/spec/error_handler_spec.rb +17 -0
  23. data/spec/fixtures/jekyll_version_2/404.html +8 -0
  24. data/spec/fixtures/jekyll_version_2/404.md +9 -0
  25. data/spec/fixtures/jekyll_version_2/_my-collection/collection-item.md +3 -0
  26. data/spec/fixtures/jekyll_version_2/_posts/2015-07-02-test-post.md +1 -1
  27. data/spec/fixtures/jekyll_version_2/about.md +3 -0
  28. data/spec/fixtures/jekyll_version_2/front_matter.md +15 -0
  29. data/spec/fixtures/jekyll_version_2/index.html +3 -1
  30. data/spec/fixtures/jekyll_version_2/only-divs.md +15 -0
  31. data/spec/fixtures/jekyll_version_2/only-paragraphs.md +15 -0
  32. data/spec/fixtures/jekyll_version_3/404.html +8 -0
  33. data/spec/fixtures/jekyll_version_3/404.md +9 -0
  34. data/spec/fixtures/jekyll_version_3/_config.yml +1 -1
  35. data/spec/fixtures/jekyll_version_3/_my-collection/collection-item.md +3 -0
  36. data/spec/fixtures/jekyll_version_3/_posts/2015-07-02-test-post.md +1 -1
  37. data/spec/fixtures/jekyll_version_3/about.md +3 -0
  38. data/spec/fixtures/jekyll_version_3/front_matter.md +15 -0
  39. data/spec/fixtures/jekyll_version_3/index.html +4 -1
  40. data/spec/fixtures/jekyll_version_3/only-divs.md +15 -0
  41. data/spec/fixtures/jekyll_version_3/only-paragraphs.md +15 -0
  42. data/spec/push_spec.rb +211 -8
  43. data/spec/record_extractor_spec.rb +296 -358
  44. data/spec/spec_helper.rb +32 -11
  45. data/txt/record_too_big +19 -0
  46. metadata +40 -51
  47. data/scripts/watch +0 -1
@@ -3,643 +3,581 @@ require 'spec_helper'
3
3
  describe(AlgoliaSearchRecordExtractor) do
4
4
  let(:extractor) { AlgoliaSearchRecordExtractor }
5
5
  let(:site) { get_site }
6
- let(:page_file) { extractor.new(site.file_by_name('about.md')) }
7
- let(:html_page_file) { extractor.new(site.file_by_name('authors.html')) }
8
- let(:post_file) { extractor.new(site.file_by_name('test-post.md')) }
9
- let(:hierarchy_page_file) { extractor.new(site.file_by_name('hierarchy.md')) }
10
- let(:weight_page_file) { extractor.new(site.file_by_name('weight.md')) }
11
- let(:document_file) { extractor.new(site.file_by_name('collection-item.md')) }
6
+ let(:fixture_page) { extractor.new(site.file_by_name('about.md')) }
7
+ let(:fixture_post) { extractor.new(site.file_by_name('test-post.md')) }
8
+ let(:fixture_document) do
9
+ extractor.new(site.file_by_name('collection-item.md'))
10
+ end
11
+ let(:fixture_only_paragraphs) do
12
+ extractor.new(site.file_by_name('only-paragraphs.md'))
13
+ end
14
+ let(:fixture_front_matter) do
15
+ extractor.new(site.file_by_name('front_matter.md'))
16
+ end
12
17
 
13
18
  before(:each) do
14
- # Disabling the logs, while still allowing to spy them
15
- Jekyll.logger = double('Specific Mock Logger').as_null_object
16
- @logger = Jekyll.logger.writer
19
+ mock_logger
17
20
  end
18
21
 
19
- describe 'metadata' do
20
- it 'gets metadata from page' do
22
+ describe 'type' do
23
+ it 'should recognize a page' do
21
24
  # Given
22
- actual = page_file.metadata
23
-
24
- # Then
25
- expect(actual[:type]).to eq 'page'
26
- expect(actual[:slug]).to eq 'about'
27
- expect(actual[:title]).to eq 'About page'
28
- expect(actual[:url]).to eq '/about.html'
29
- expect(actual[:custom]).to eq 'Foo'
30
- end
25
+ input = fixture_page
31
26
 
32
- it 'gets metadata from post' do
33
- # Given
34
- actual = post_file.metadata
27
+ # When
28
+ actual = input.type
35
29
 
36
- # Then
37
- expect(actual[:slug]).to eq 'test-post'
38
- expect(actual[:title]).to eq 'Test post'
39
- expect(actual[:url]).to eq '/2015/07/02/test-post.html'
40
- expect(actual[:posted_at]).to eq 1_435_788_000
41
- expect(actual[:custom]).to eq 'Foo'
30
+ expect(actual).to eq 'page'
42
31
  end
43
32
 
44
- it 'gets posted_at timestamp based on the configured timezone' do
33
+ it 'should recognize a post' do
45
34
  # Given
46
- site = get_site(timezone: 'America/New_York')
47
- post_file = extractor.new(site.file_by_name('test-post.md'))
48
- actual = post_file.metadata
49
-
50
- # Then
51
- expect(actual[:posted_at]).to eq 1_435_809_600
52
- end
35
+ input = fixture_post
53
36
 
54
- it 'gets metadata from document' do
55
- # Given
56
- actual = document_file.metadata
37
+ # When
38
+ actual = input.type
57
39
 
58
- # Then
59
- expect(actual[:type]).to eq 'document'
60
- expect(actual[:slug]).to eq 'collection-item'
61
- expect(actual[:title]).to eq 'Collection Item'
62
- expect(actual[:url]).to eq '/my-collection/collection-item.html'
63
- expect(actual[:custom]).to eq 'Foo'
40
+ expect(actual).to eq 'post'
64
41
  end
65
42
 
66
- if restrict_jekyll_version(more_than: '3.0')
67
- describe 'Jekyll > 3.0' do
68
- it 'should not throw any deprecation warnings' do
69
- # Given
70
-
71
- # When
72
- post_file.metadata
43
+ it 'should recognize a document' do
44
+ # Given
45
+ input = fixture_document
73
46
 
74
- # Expect
75
- expect(@logger).to_not have_received(:warn)
76
- end
77
- end
47
+ # When
48
+ actual = input.type
78
49
 
50
+ expect(actual).to eq 'document'
79
51
  end
80
52
  end
81
53
 
82
- describe 'slug' do
83
- it 'gets it from data if available' do
54
+ describe 'url' do
55
+ it 'should use the page url' do
84
56
  # Given
85
- post_file.file.data['slug'] = 'foo'
86
- allow(post_file.file).to receive(:respond_to?).with(:slug) do
87
- false
88
- end
57
+ input = fixture_page
89
58
 
90
59
  # When
91
- actual = post_file.slug
60
+ actual = input.url
92
61
 
93
- # Then
94
- expect(actual).to eql('foo')
62
+ expect(actual).to eq '/about.html'
95
63
  end
96
64
 
97
- it 'gets it from the root if not in data' do
65
+ it 'should use the post url' do
98
66
  # Given
99
- post_file.file.data.delete 'slug'
100
- allow(post_file.file).to receive(:slug).and_return('foo')
67
+ input = fixture_post
101
68
 
102
69
  # When
103
- actual = post_file.slug
70
+ actual = input.url
104
71
 
105
- # Then
106
- expect(actual).to eql('foo')
72
+ expect(actual).to eq '/2015/07/02/test-post.html'
107
73
  end
108
74
 
109
- it 'gets it from the data even if in the root' do
75
+ it 'should use the document url' do
110
76
  # Given
111
- post_file.file.data['slug'] = 'foo'
112
- allow(post_file.file).to receive(:slug).and_return('bar')
77
+ input = fixture_document
113
78
 
114
79
  # When
115
- actual = post_file.slug
80
+ actual = input.url
116
81
 
117
- # Then
118
- expect(actual).to eql('foo')
82
+ expect(actual).to eq '/my-collection/collection-item.html'
119
83
  end
84
+ end
120
85
 
121
- it 'guesses it from the path if not found' do
86
+ describe 'title' do
87
+ it 'should use the page title' do
122
88
  # Given
123
- post_file.file.data.delete 'slug'
124
- allow(post_file.file).to receive(:respond_to?).with(:slug) do
125
- false
126
- end
127
- allow(post_file.file).to receive(:path) do
128
- '/path/to/file/foo.html'
129
- end
89
+ input = fixture_page
130
90
 
131
91
  # When
132
- actual = post_file.slug
92
+ actual = input.title
133
93
 
134
- # # Then
135
- expect(actual).to eql('foo')
94
+ expect(actual).to eq 'About page'
136
95
  end
137
- end
138
96
 
139
- describe 'tags' do
140
- it 'returns tags in data if available' do
97
+ it 'should use the post title' do
141
98
  # Given
142
- post_file.file.data['tags'] = %w(foo bar)
143
- allow(post_file.file).to receive(:respond_to?).with(:tags) do
144
- false
145
- end
99
+ input = fixture_post
146
100
 
147
101
  # When
148
- actual = post_file.tags
102
+ actual = input.title
149
103
 
150
- # Then
151
- expect(actual).to include('foo', 'bar')
104
+ expect(actual).to eq 'Test post'
152
105
  end
153
106
 
154
- it 'returns tags at the root if not in data' do
107
+ it 'should use the document title' do
155
108
  # Given
156
- post_file.file.data.delete 'tags'
157
- allow(post_file.file).to receive(:tags).and_return(%w(foo bar))
109
+ input = fixture_document
158
110
 
159
111
  # When
160
- actual = post_file.tags
112
+ actual = input.title
161
113
 
162
- # Then
163
- expect(actual).to include('foo', 'bar')
114
+ expect(actual).to eq 'Collection Item'
115
+ end
116
+ end
117
+
118
+ describe 'slug' do
119
+ if restrict_jekyll_version(more_than: '3.0')
120
+ it 'should not throw a deprecation warning' do
121
+ # Given
122
+ input = fixture_post
123
+
124
+ # When
125
+ input.slug
126
+
127
+ # Then
128
+ expect(Jekyll.logger)
129
+ .to_not have_received(:warn).with('Deprecation:', any_args)
130
+ end
164
131
  end
165
132
 
166
- it 'returns tags in data even if in root' do
133
+ it 'should get it for a page' do
167
134
  # Given
168
- post_file.file.data['tags'] = %w(foo bar)
169
- allow(post_file.file).to receive(:tags).and_return(%w(js css))
135
+ input = fixture_page
170
136
 
171
137
  # When
172
- actual = post_file.tags
138
+ actual = input.slug
173
139
 
174
- # Then
175
- expect(actual).to include('foo', 'bar')
140
+ expect(actual).to eq 'about'
176
141
  end
177
142
 
178
- it 'parses tags as string if they are another type' do
143
+ it 'should get it for a post' do
179
144
  # Given
180
- tag_foo = double('Extended Tag', to_s: 'foo')
181
- tag_bar = double('Extended Tag', to_s: 'bar')
182
- post_file.file.data['tags'] = [tag_foo, tag_bar]
183
- allow(post_file.file).to receive(:respond_to?).with(:tags) do
184
- false
185
- end
145
+ input = fixture_post
186
146
 
187
147
  # When
188
- actual = post_file.tags
148
+ actual = input.slug
189
149
 
190
- # Then
191
- expect(actual).to include('foo', 'bar')
150
+ expect(actual).to eq 'test-post'
192
151
  end
193
152
 
194
- it 'extract tags from front matter' do
153
+ it 'should get it for a document' do
195
154
  # Given
196
- actual = post_file.tags
155
+ input = fixture_document
197
156
 
198
- # Then
199
- expect(actual).to include('tag', 'another tag')
157
+ # When
158
+ actual = input.slug
159
+
160
+ expect(actual).to eq 'collection-item'
200
161
  end
201
162
  end
202
163
 
203
- describe 'html_nodes' do
204
- it 'returns the list of all <p> by default' do
205
- expect(page_file.html_nodes.size).to eq 6
206
- end
164
+ describe 'tags' do
165
+ if restrict_jekyll_version(more_than: '3.0')
166
+ it 'should not throw a deprecation warning' do
167
+ # Given
168
+ input = fixture_post
207
169
 
208
- it 'allow _config.yml to override the selector' do
209
- # Given
210
- site = get_site(algolia: { 'record_css_selector' => 'p,ul' })
211
- page_file = extractor.new(site.file_by_name('about.md'))
170
+ # When
171
+ input.tags
212
172
 
213
- expect(page_file.html_nodes.size).to eq 7
173
+ # Then
174
+ expect(Jekyll.logger)
175
+ .to_not have_received(:warn).with('Deprecation:', any_args)
176
+ end
214
177
  end
215
- end
216
178
 
217
- describe 'node_heading_parent' do
218
- it 'returns the direct heading right above' do
179
+ it 'should get tags from page' do
219
180
  # Given
220
- nodes = hierarchy_page_file.html_nodes
221
- p = nodes[0]
181
+ input = fixture_page
222
182
 
223
183
  # When
224
- actual = hierarchy_page_file.node_heading_parent(p)
184
+ actual = input.tags
225
185
 
226
- # Then
227
- expect(actual.name).to eq 'h1'
228
- expect(actual.text).to eq 'H1'
186
+ expect(actual).to include('tag', 'another tag')
229
187
  end
230
188
 
231
- it 'returns the closest heading even if in a sub tag' do
189
+ it 'should get tags from post' do
232
190
  # Given
233
- nodes = hierarchy_page_file.html_nodes
234
- p = nodes[2]
191
+ input = fixture_post
235
192
 
236
193
  # When
237
- actual = hierarchy_page_file.node_heading_parent(p)
194
+ actual = input.tags
238
195
 
239
- # Then
240
- expect(actual.name).to eq 'h2'
241
- expect(actual.text).to eq 'H2A'
196
+ expect(actual).to include('tag', 'another tag')
242
197
  end
243
198
 
244
- it 'should automatically go up one level when indexing headings' do
199
+ it 'should get tags from document' do
245
200
  # Given
246
- site = get_site(algolia: { 'record_css_selector' => 'p,h2' })
247
- hierarchy_page_file = extractor.new(site.file_by_name('hierarchy.md'))
248
- nodes = hierarchy_page_file.html_nodes
249
- h2 = nodes[4]
201
+ input = fixture_document
250
202
 
251
203
  # When
252
- actual = hierarchy_page_file.node_heading_parent(h2)
204
+ actual = input.tags
253
205
 
254
- # Then
255
- expect(actual.name).to eq 'h1'
256
- expect(actual.text).to eq 'H1'
206
+ expect(actual).to include('tag', 'another tag')
257
207
  end
258
208
 
259
- it 'should find the correct parent when indexing deep headings' do
209
+ it 'should handle custom extended tags' do
260
210
  # Given
261
- site = get_site(algolia: { 'record_css_selector' => 'h2' })
262
- hierarchy_page_file = extractor.new(site.file_by_name('hierarchy.md'))
263
- nodes = hierarchy_page_file.html_nodes
264
- h2 = nodes[2]
211
+ extended_tags = [
212
+ double('Extended Tag', to_s: 'extended tag'),
213
+ double('Extended Tag', to_s: 'extended another tag')
214
+ ]
215
+ input = fixture_post
216
+
217
+ # Overwrite string tags with more advanced ones
218
+ if restrict_jekyll_version(less_than: '3.0')
219
+ allow(input.file).to receive(:tags) { extended_tags }
220
+ else
221
+ input.file.data['tags'] = extended_tags
222
+ end
265
223
 
266
224
  # When
267
- actual = hierarchy_page_file.node_heading_parent(h2)
225
+ actual = input.tags
268
226
 
269
- # Then
270
- expect(actual.name).to eq 'h1'
271
- expect(actual.text).to eq 'H1'
227
+ expect(actual).to include('extended tag', 'extended another tag')
272
228
  end
273
229
  end
274
230
 
275
- describe 'node_hierarchy' do
276
- it 'returns the unique parent of a simple element' do
277
- # Note: First <p> should only have a h1 as hierarchy
231
+ describe 'date' do
232
+ it 'should get the date as a timestamp for posts' do
278
233
  # Given
279
- nodes = hierarchy_page_file.html_nodes
280
- p = nodes[0]
234
+ input = fixture_post
281
235
 
282
236
  # When
283
- actual = hierarchy_page_file.node_hierarchy(p)
237
+ actual = input.date
284
238
 
285
239
  # Then
286
- expect(actual).to include(h1: 'H1')
240
+ expect(actual).to eq 1_435_788_000
287
241
  end
288
242
 
289
- it 'returns the heading hierarchy of multiple headings' do
290
- # Note: 5th <p> is inside h3, second h2 and main h1
243
+ it 'should be nil for pages' do
291
244
  # Given
292
- nodes = hierarchy_page_file.html_nodes
293
- p = nodes[4]
245
+ input = fixture_page
294
246
 
295
247
  # When
296
- actual = hierarchy_page_file.node_hierarchy(p)
248
+ actual = input.date
297
249
 
298
250
  # Then
299
- expect(actual).to include(h1: 'H1', h2: 'H2B', h3: 'H3A')
251
+ expect(actual).to eq nil
300
252
  end
301
253
 
302
- it 'works even if heading not on the same level' do
303
- # Note: The 6th <p> is inside a div
254
+ it 'should generate the timestamp relative to the configured timezone' do
304
255
  # Given
305
- nodes = hierarchy_page_file.html_nodes
306
- p = nodes[5]
256
+ site = get_site(timezone: 'America/New_York')
257
+ input = extractor.new(site.file_by_name('test-post.md'))
307
258
 
308
259
  # When
309
- actual = hierarchy_page_file.node_hierarchy(p)
260
+ actual = input.date
310
261
 
311
262
  # Then
312
- expect(actual).to include(h1: 'H1', h2: 'H2B', h3: 'H3A', h4: 'H4')
263
+ expect(actual).to eq 1_435_809_600
313
264
  end
265
+ end
314
266
 
315
- it 'includes node in the output if headings are indexed' do
267
+ describe 'collection' do
268
+ it 'should get the collection name for documents' do
316
269
  # Given
317
- site = get_site(algolia: { 'record_css_selector' => 'h1' })
318
- hierarchy_page_file = extractor.new(site.file_by_name('hierarchy.md'))
319
- nodes = hierarchy_page_file.html_nodes
320
- h1 = nodes[0]
270
+ input = fixture_document
321
271
 
322
272
  # When
323
- actual = hierarchy_page_file.node_hierarchy(h1)
273
+ actual = input.collection
324
274
 
325
275
  # Then
326
- expect(actual).to include(h1: 'H1')
276
+ expect(actual).to eq 'my-collection'
327
277
  end
328
278
 
329
- it 'escape html in headings' do
279
+ it 'should be nil for pages' do
330
280
  # Given
331
- nodes = hierarchy_page_file.html_nodes
332
- p = nodes[7]
281
+ input = fixture_page
333
282
 
334
283
  # When
335
- actual = hierarchy_page_file.node_hierarchy(p)
284
+ actual = input.collection
336
285
 
337
286
  # Then
338
- expect(actual).to include(h3: 'H3B &lt;code&gt;')
287
+ expect(actual).to eq nil
339
288
  end
340
- end
341
289
 
342
- describe 'node_raw_html' do
343
- it 'returns html including surrounding tags' do
344
- # Note: 3rd <p> is a real HTML with a custom class
290
+ it 'should be nil for posts' do
345
291
  # Given
346
- nodes = page_file.html_nodes
347
- p = nodes[3]
292
+ input = fixture_post
348
293
 
349
294
  # When
350
- actual = page_file.node_raw_html(p)
295
+ actual = input.collection
351
296
 
352
297
  # Then
353
- expect(actual).to eq '<p id="text4">Another text 4</p>'
298
+ expect(actual).to eq nil
354
299
  end
355
300
  end
356
301
 
357
- describe 'node_text' do
358
- it 'returns inner text with <> escaped' do
359
- # Note: 4th <p> contains a <code> tag with <>
302
+ describe 'front_matter' do
303
+ it 'should get a hash of all front matter data' do
360
304
  # Given
361
- nodes = page_file.html_nodes
362
- p = nodes[4]
305
+ input = fixture_front_matter
363
306
 
364
307
  # When
365
- actual = page_file.node_text(p)
308
+ actual = input.front_matter
366
309
 
367
310
  # Then
368
- expect(actual).to eq 'Another &lt;text&gt; 5'
311
+ expect(actual[:author]).to eq 'John Doe'
312
+ expect(actual[:custom]).to eq 'foo'
369
313
  end
370
- end
371
314
 
372
- describe 'unique_hierarchy' do
373
- it 'combines title and headings' do
315
+ it 'should remove known keys from the front-matter' do
374
316
  # Given
375
- hierarchy = {
376
- title: 'title',
377
- h1: 'h1',
378
- h2: 'h2',
379
- h3: 'h3',
380
- h4: 'h4',
381
- h5: 'h5',
382
- h6: 'h6'
383
- }
317
+ input = fixture_front_matter
384
318
 
385
319
  # When
386
- actual = page_file.unique_hierarchy(hierarchy)
320
+ actual = input.front_matter
387
321
 
388
322
  # Then
389
- expect(actual).to eq 'title > h1 > h2 > h3 > h4 > h5 > h6'
323
+ expect(actual[:title]).to eq nil
324
+ expect(actual[:tags]).to eq nil
325
+ expect(actual[:slug]).to eq nil
326
+ expect(actual[:url]).to eq nil
327
+ expect(actual[:date]).to eq nil
328
+ expect(actual[:type]).to eq nil
390
329
  end
391
330
 
392
- it 'combines title and headings even with missing elements' do
331
+ it 'should cast keys as symbols' do
393
332
  # Given
394
- hierarchy = {
395
- title: 'title',
396
- h2: 'h2',
397
- h4: 'h4',
398
- h6: 'h6'
399
- }
333
+ input = fixture_front_matter
400
334
 
401
335
  # When
402
- actual = page_file.unique_hierarchy(hierarchy)
336
+ actual = input.front_matter
403
337
 
404
338
  # Then
405
- expect(actual).to eq 'title > h2 > h4 > h6'
339
+ expect(actual['custom']).to eq nil
340
+ expect(actual[:custom]).to_not eq nil
341
+ expect(actual['author']).to eq nil
342
+ expect(actual[:author]).to_not eq nil
406
343
  end
407
344
  end
408
345
 
409
- describe 'node_css_selector' do
410
- it 'uses the #id to make the selector more precise if one is found' do
346
+ describe 'extract' do
347
+ it 'should get one item per node' do
411
348
  # Given
412
- nodes = page_file.html_nodes
413
- p = nodes[3]
349
+ input = fixture_only_paragraphs
414
350
 
415
351
  # When
416
- actual = page_file.node_css_selector(p)
352
+ actual = input.extract
417
353
 
418
354
  # Then
419
- expect(actual).to eq '#text4'
355
+ expect(actual.size).to eq 6
420
356
  end
421
357
 
422
- it 'uses p:nth-of-type if no #id found' do
358
+ it 'should get a complete record' do
423
359
  # Given
424
- nodes = page_file.html_nodes
425
- p = nodes[2]
360
+ input = fixture_page
426
361
 
427
362
  # When
428
- actual = page_file.node_css_selector(p)
363
+ actual = input.extract
429
364
 
430
365
  # Then
431
- expect(actual).to eq 'p:nth-of-type(3)'
366
+ # Jekyll auto-generates anchors on heading
367
+ expect(actual[0][:anchor]).to eq 'heading-1'
368
+ # It's a page, so no date
369
+ expect(actual[0][:date]).to eq nil
370
+ # Hierarchy on first level
371
+ expect(actual[0][:hierarchy][:lvl0]).to eq 'Heading 1'
372
+ expect(actual[0][:hierarchy][:lvl1]).to eq nil
373
+ # Node content
374
+ expect(actual[0][:tag_name]).to eq 'p'
375
+ expect(actual[0][:html]).to eq '<p>Text 1</p>'
376
+ expect(actual[0][:text]).to eq 'Text 1'
377
+ # Page
378
+ expect(actual[0][:title]).to eq 'About page'
379
+ expect(actual[0][:slug]).to eq 'about'
380
+ expect(actual[0][:url]).to eq '/about.html'
381
+ # Tags
382
+ expect(actual[0][:tags]).to eq ['tag', 'another tag']
383
+ # Weight
384
+ expect(actual[0][:weight][:heading]).to eq 90
385
+ expect(actual[0][:weight][:position]).to eq 0
432
386
  end
433
387
 
434
- it 'handles custom <div> markup' do
388
+ it 'should allow overriding the node selector' do
435
389
  # Given
436
- nodes = page_file.html_nodes
437
- p = nodes[5]
390
+ site = get_site(algolia: { 'record_css_selector' => 'div' })
391
+ input = extractor.new(site.file_by_name('only-divs.md'))
438
392
 
439
393
  # When
440
- actual = page_file.node_css_selector(p)
394
+ actual = input.extract
441
395
 
442
396
  # Then
443
- expect(actual).to eq 'div:nth-of-type(2) > p'
397
+ expect(actual.size).to eq 6
444
398
  end
445
- end
446
399
 
447
- describe 'weight_heading_relevance' do
448
- it 'gets the number of words in text also in the title' do
400
+ it 'should contain all the basic top level info' do
449
401
  # Given
450
- data = {
451
- title: 'foo bar',
452
- text: 'Lorem ipsum dolor foo bar, consectetur adipiscing elit'
453
- }
402
+ input = fixture_page
403
+ allow(input).to receive(:date) { 'mock_date' }
404
+ allow(input).to receive(:slug) { 'mock_slug' }
405
+ allow(input).to receive(:tags) { 'mock_tags' }
406
+ allow(input).to receive(:title) { 'mock_title' }
407
+ allow(input).to receive(:url) { 'mock_url' }
408
+ allow(input).to receive(:type) { 'mock_type' }
454
409
 
455
410
  # When
456
- actual = page_file.weight_heading_relevance(data)
411
+ actual = input.extract
457
412
 
458
413
  # Then
459
- expect(actual).to eq 2
414
+ expect(actual[0][:date]).to eq 'mock_date'
415
+ expect(actual[0][:slug]).to eq 'mock_slug'
416
+ expect(actual[0][:tags]).to eq 'mock_tags'
417
+ expect(actual[0][:title]).to eq 'mock_title'
418
+ expect(actual[0][:url]).to eq 'mock_url'
419
+ expect(actual[0][:type]).to eq 'mock_type'
460
420
  end
461
421
 
462
- it 'gets the number of words in text also in the headings' do
422
+ it 'should add node data from extractor' do
463
423
  # Given
464
- data = {
465
- title: 'foo',
466
- h1: 'bar',
467
- h2: 'baz',
468
- text: 'Lorem baz dolor foo bar, consectetur adipiscing elit'
469
- }
424
+ input = fixture_page
425
+ allow(input).to receive(:hierarchy_nodes) do
426
+ [
427
+ { name: 'foo' },
428
+ { name: 'bar' }
429
+ ]
430
+ end
470
431
 
471
432
  # When
472
- actual = page_file.weight_heading_relevance(data)
433
+ actual = input.extract
473
434
 
474
435
  # Then
475
- expect(actual).to eq 3
436
+ expect(actual[0][:name]).to eq 'foo'
476
437
  end
477
438
 
478
- it 'count each word only once' do
439
+ it 'should not expose the HTML node' do
479
440
  # Given
480
- data = {
481
- title: 'foo',
482
- h1: 'foo foo foo',
483
- h2: 'bar bar foo bar',
484
- text: 'foo bar bar bar bar baz foo bar baz'
485
- }
441
+ input = fixture_only_paragraphs
486
442
 
487
443
  # When
488
- actual = page_file.weight_heading_relevance(data)
444
+ actual = input.extract
489
445
 
490
446
  # Then
491
- expect(actual).to eq 2
447
+ expect(actual[0][:node]).to eq nil
492
448
  end
493
449
 
494
- it 'is case-insensitive' do
450
+ it 'should set the objectID as a hash' do
495
451
  # Given
496
- data = {
497
- title: 'FOO',
498
- h1: 'bar Bar BAR',
499
- text: 'foo BAR'
500
- }
452
+ input = fixture_page
501
453
 
502
454
  # When
503
- actual = page_file.weight_heading_relevance(data)
455
+ actual = input.extract
504
456
 
505
457
  # Then
506
- expect(actual).to eq 2
458
+ expect(actual[0]).not_to have_key(:uuid)
459
+ expect(actual[0]).to have_key(:objectID)
507
460
  end
508
461
 
509
- it 'should only use words, no partial matches' do
462
+ it 'should not contain a collection key for pages' do
510
463
  # Given
511
- data = {
512
- title: 'foo bar',
513
- text: 'xxxfooxxx bar'
514
- }
464
+ input = fixture_page
515
465
 
516
466
  # When
517
- actual = page_file.weight_heading_relevance(data)
467
+ actual = input.extract
518
468
 
519
469
  # Then
520
- expect(actual).to eq 1
470
+ expect(actual[0]).not_to have_key(:collection)
521
471
  end
522
472
 
523
- it 'should still work with non-string keys' do
473
+ it 'should not contain a collection key for posts' do
524
474
  # Given
525
- data = {
526
- title: nil,
527
- h1: [],
528
- h2: {},
529
- h3: true,
530
- h4: false,
531
- h5: 'foo bar',
532
- text: 'foo bar'
533
- }
475
+ input = fixture_post
534
476
 
535
477
  # When
536
- actual = page_file.weight_heading_relevance(data)
478
+ actual = input.extract
537
479
 
538
480
  # Then
539
- expect(actual).to eq 2
481
+ expect(actual[0]).not_to have_key(:collection)
540
482
  end
541
- end
542
483
 
543
- describe 'weight_tag_name' do
544
- it 'gives a score of 0 to non-headings' do
484
+ it 'should contain the collection name for documents' do
545
485
  # Given
546
- data = {
547
- tag_name: 'p'
548
- }
486
+ page = fixture_document
549
487
 
550
488
  # When
551
- actual = page_file.weight_tag_name(data)
489
+ page_data = page.extract
552
490
 
553
491
  # Then
554
- expect(actual).to eq 0
492
+ expect(page_data[0][:collection]).to eq 'my-collection'
555
493
  end
556
- it 'gives a score of 100 to h1' do
494
+
495
+ it 'should not contain a date key for pages' do
557
496
  # Given
558
- data = {
559
- tag_name: 'h1'
560
- }
497
+ input = fixture_page
561
498
 
562
499
  # When
563
- actual = page_file.weight_tag_name(data)
500
+ actual = input.extract
564
501
 
565
502
  # Then
566
- expect(actual).to eq 100
503
+ expect(actual[0]).not_to have_key(:date)
567
504
  end
568
- it 'gives a score of 40 to h6' do
505
+ end
506
+
507
+ describe 'custom_hook_each' do
508
+ it 'should be called on every item' do
569
509
  # Given
570
- data = {
571
- tag_name: 'h6'
572
- }
510
+ input = fixture_page
511
+ allow(input).to receive(:custom_hook_each).and_call_original
573
512
 
574
513
  # When
575
- actual = page_file.weight_tag_name(data)
514
+ actual = input.extract
576
515
 
577
516
  # Then
578
- expect(actual).to eq 50
517
+ expect(input).to have_received(:custom_hook_each)
518
+ .exactly(actual.size).times
579
519
  end
580
- end
581
520
 
582
- describe 'weight' do
583
- it 'returns an object with all weights' do
521
+ it 'should let users change the item' do
584
522
  # Given
585
- item = {
586
- tag_name: 'p'
587
- }
588
- allow(page_file).to receive(:weight_tag_name) { 10 }
589
- allow(page_file).to receive(:weight_heading_relevance) { 20 }
523
+ input = fixture_page
524
+ def input.custom_hook_each(item, _)
525
+ item['foo'] = 'bar'
526
+ item
527
+ end
590
528
 
591
529
  # When
592
- actual = page_file.weight(item, 42)
530
+ actual = input.extract
593
531
 
594
532
  # Then
595
- expect(actual).to include(tag_name: 10)
596
- expect(actual).to include(heading_relevance: 20)
597
- expect(actual).to include(position: 42)
533
+ expect(actual[0]['foo']).to eq 'bar'
598
534
  end
599
- end
600
535
 
601
- describe 'custom_hook_each' do
602
- it 'let the user call a custom hook to modify a record' do
536
+ it 'should let a user remove an item by returning nil' do
603
537
  # Given
604
- def page_file.custom_hook_each(item, _)
605
- item[:custom_attribute] = 'foo'
606
- item
538
+ input = fixture_page
539
+ def input.custom_hook_each(_, _)
540
+ nil
607
541
  end
608
542
 
609
543
  # When
610
- actual = page_file.extract
544
+ actual = input.extract
611
545
 
612
546
  # Then
613
- expect(actual[0]).to include(custom_attribute: 'foo')
547
+ expect(actual.size).to eq 0
614
548
  end
615
549
 
616
- it 'let the user discard a record by returning nil' do
550
+ it 'should be passed the Nokogiri node as second argument' do
617
551
  # Given
618
- def page_file.custom_hook_each(_, _)
619
- nil
552
+ input = fixture_page
553
+ def input.custom_hook_each(item, nokogiri_node)
554
+ item['foo'] = nokogiri_node
555
+ item
620
556
  end
621
557
 
622
558
  # When
623
- actual = page_file.extract
559
+ actual = input.extract
624
560
 
625
561
  # Then
626
- expect(actual.size).to eq 0
562
+ expect(actual[0]['foo']).to be_an(Nokogiri::XML::Element)
627
563
  end
628
564
  end
629
565
 
630
566
  describe 'custom_hook_all' do
631
- it 'let the user call a custom hook to modify the list of records' do
567
+ it 'should let the user update the list of records' do
632
568
  # Given
633
- def page_file.custom_hook_all(items)
634
- [items[0], { foo: 'bar' }]
569
+ input = fixture_page
570
+ def input.custom_hook_all(_)
571
+ [{
572
+ 'foo' => 'bar'
573
+ }]
635
574
  end
636
575
 
637
576
  # When
638
- actual = page_file.extract
577
+ actual = input.extract
639
578
 
640
579
  # Then
641
- expect(actual.size).to eq 2
642
- expect(actual[1]).to include(foo: 'bar')
580
+ expect(actual[0]['foo']).to eq 'bar'
643
581
  end
644
582
  end
645
583
  end