html-hierarchy-extractor 1.0.2 → 1.0.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,9 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- ./scripts/test || exit 1
4
-
5
- # No over-complex methods
6
- ./scripts/check_flog || exit 1
7
-
8
- # No duplication
9
- ./scripts/check_flay
@@ -1,2 +0,0 @@
1
- #!/usr/bin/env bash
2
- rubocop -F './lib/' './spec'
@@ -1,13 +0,0 @@
1
- #!/usr/bin/env bash
2
- # Stop if any command fails
3
- set -e
4
-
5
- git checkout master
6
- git pull
7
-
8
- git rebase develop
9
- bundle install
10
- rake release
11
-
12
- git checkout develop
13
- git rebase master
@@ -1,4 +0,0 @@
1
- #!/usr/bin/env bash
2
- cd "$(dirname "$BASH_SOURCE")"/..
3
-
4
- COVERAGE=1 bundle exec rspec
@@ -1,7 +0,0 @@
1
- #!/usr/bin/env bash
2
- # This script will be started by Travis, in the correct context (matrix of Ruby
3
- # version + Gemfile version), so it only needs to load the tests, without
4
- # worrying about appraisal
5
- cd "$(dirname "$BASH_SOURCE")"/..
6
-
7
- COVERAGE=1 bundle exec rspec
@@ -1,4 +0,0 @@
1
- #!/usr/bin/env bash
2
- cd "$(dirname "$BASH_SOURCE")"/..
3
-
4
- guard
@@ -1,441 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe(HTMLHierarchyExtractor) do
4
- describe 'extract' do
5
- it 'should load from an HTML string' do
6
- # Given
7
- input = '<p>foo</p>'
8
-
9
- # When
10
- actual = HTMLHierarchyExtractor.new(input).extract
11
-
12
- # Then
13
- expect(actual.size).to eq 1
14
- end
15
-
16
- it 'should allow overriding of the default css selector of nodes' do
17
- # Given
18
- input = '<div>foo</div>'
19
-
20
- # When
21
- options = {
22
- css_selector: 'div'
23
- }
24
- actual = HTMLHierarchyExtractor.new(input, options: options).extract
25
-
26
- # Then
27
- expect(actual.size).to eq 1
28
- end
29
-
30
- it 'should export the Nokogiri node' do
31
- # Given
32
- input = '<p>foo</p>'
33
-
34
- # When
35
- actual = HTMLHierarchyExtractor.new(input).extract
36
-
37
- # Then
38
- expect(actual[0][:node]).to be_an(Nokogiri::XML::Element)
39
- end
40
-
41
- it 'should remove empty elements' do
42
- # Given
43
- input = '<p></p>'
44
-
45
- # When
46
- actual = HTMLHierarchyExtractor.new(input).extract
47
-
48
- # Then
49
- expect(actual.size).to eq 0
50
- end
51
-
52
- it 'should add the DOM position to each element' do
53
- # Given
54
- input = '<p>foo</p>
55
- <p>bar</p>
56
- <p>baz</p>'
57
-
58
- # When
59
- actual = HTMLHierarchyExtractor.new(input).extract
60
-
61
- # Then
62
- expect(actual[0][:weight][:position]).to eq 0
63
- expect(actual[1][:weight][:position]).to eq 1
64
- expect(actual[2][:weight][:position]).to eq 2
65
- end
66
- end
67
-
68
- describe 'extract_html' do
69
- it 'should extract outer html' do
70
- # Given
71
- input = '<p>foo</p>'
72
-
73
- # When
74
- actual = HTMLHierarchyExtractor.new(input).extract
75
-
76
- # Then
77
- expect(actual[0][:html]).to eq '<p>foo</p>'
78
- end
79
-
80
- it 'should trim content' do
81
- # Given
82
- input = '<p>foo</p>
83
- <blink>irrelevant</blink>'
84
-
85
- # When
86
- actual = HTMLHierarchyExtractor.new(input).extract
87
-
88
- # Then
89
- expect(actual[0][:html]).to eq '<p>foo</p>'
90
- end
91
- end
92
-
93
- describe 'extract_text' do
94
- it 'should extract inner text' do
95
- # Given
96
- input = '<p>foo</p>'
97
-
98
- # When
99
- actual = HTMLHierarchyExtractor.new(input).extract
100
-
101
- # Then
102
- expect(actual[0][:text]).to eq 'foo'
103
- end
104
-
105
- it 'should extract UTF8 correctly' do
106
- # Given
107
- input = '<p>UTF8‽✗✓</p>'
108
-
109
- # When
110
- actual = HTMLHierarchyExtractor.new(input).extract
111
-
112
- # Then
113
- expect(actual[0][:text]).to eq 'UTF8‽✗✓'
114
- end
115
- end
116
-
117
- describe 'extract_tag_name' do
118
- it 'should extract the tag name' do
119
- # Given
120
- input = '<p>foo</p>'
121
-
122
- # When
123
- actual = HTMLHierarchyExtractor.new(input).extract
124
-
125
- # Then
126
- expect(actual[0][:tag_name]).to eq 'p'
127
- end
128
-
129
- it 'should always return lowercase' do
130
- # Given
131
- input = '<P>foo</P>'
132
-
133
- # When
134
- actual = HTMLHierarchyExtractor.new(input).extract
135
-
136
- # Then
137
- expect(actual[0][:tag_name]).to eq 'p'
138
- end
139
- end
140
-
141
- describe 'extract_hierarchy' do
142
- it 'should extract a simple hierarchy' do
143
- # Given
144
- input = '<h1>Foo</h1>
145
- <p>First paragraph</p>
146
- <h2>Bar</h2>
147
- <p>Second paragraph</p>
148
- <h3>Baz</h3>
149
- <p>Third paragraph</p>'
150
-
151
- # When
152
- actual = HTMLHierarchyExtractor.new(input).extract
153
-
154
- # Then
155
- expect(actual[0][:hierarchy][:lvl0]).to eq 'Foo'
156
- expect(actual[0][:hierarchy][:lvl1]).to eq nil
157
- expect(actual[0][:hierarchy][:lvl2]).to eq nil
158
-
159
- expect(actual[1][:hierarchy][:lvl0]).to eq 'Foo'
160
- expect(actual[1][:hierarchy][:lvl1]).to eq 'Bar'
161
- expect(actual[1][:hierarchy][:lvl2]).to eq nil
162
-
163
- expect(actual[2][:hierarchy][:lvl0]).to eq 'Foo'
164
- expect(actual[2][:hierarchy][:lvl1]).to eq 'Bar'
165
- expect(actual[2][:hierarchy][:lvl2]).to eq 'Baz'
166
- end
167
-
168
- it 'should use inner text of headings' do
169
- # Given
170
- input = '<h1><a href="#">Foo</a><span></span></h1>
171
- <p>First paragraph</p>'
172
-
173
- # When
174
- actual = HTMLHierarchyExtractor.new(input).extract
175
-
176
- # Then
177
- expect(actual[0][:hierarchy][:lvl0]).to eq 'Foo'
178
- expect(actual[0][:hierarchy][:lvl1]).to eq nil
179
- expect(actual[0][:hierarchy][:lvl2]).to eq nil
180
- end
181
-
182
- it 'should handle nodes not in any hierarchy' do
183
- # Given
184
- input = '<p>First paragraph</p>
185
- <h1>Foo</h1>'
186
-
187
- # When
188
- actual = HTMLHierarchyExtractor.new(input).extract
189
-
190
- # Then
191
- expect(actual[0][:hierarchy][:lvl0]).to eq nil
192
- expect(actual[0][:hierarchy][:lvl1]).to eq nil
193
- expect(actual[0][:hierarchy][:lvl2]).to eq nil
194
- end
195
-
196
- it 'should handle any number of wrappers' do
197
- # Given
198
- input = '<header>
199
- <h1>Foo</h1>
200
- <p>First paragraph</p>
201
- </header>
202
- <div>
203
- <div>
204
- <div>
205
- <h2>Bar</h2>
206
- <p>Second paragraph</p>
207
- </div>
208
- </div>
209
- <div>
210
- <h3>Baz</h3>
211
- <p>Third paragraph</p>
212
- </div>
213
- </div>'
214
-
215
- # When
216
- actual = HTMLHierarchyExtractor.new(input).extract
217
-
218
- # Then
219
- expect(actual[0][:hierarchy][:lvl0]).to eq 'Foo'
220
- expect(actual[0][:hierarchy][:lvl1]).to eq nil
221
- expect(actual[0][:hierarchy][:lvl2]).to eq nil
222
-
223
- expect(actual[1][:hierarchy][:lvl0]).to eq 'Foo'
224
- expect(actual[1][:hierarchy][:lvl1]).to eq 'Bar'
225
- expect(actual[1][:hierarchy][:lvl2]).to eq nil
226
-
227
- expect(actual[2][:hierarchy][:lvl0]).to eq 'Foo'
228
- expect(actual[2][:hierarchy][:lvl1]).to eq 'Bar'
229
- expect(actual[2][:hierarchy][:lvl2]).to eq 'Baz'
230
- end
231
- end
232
-
233
- describe 'extract_anchor' do
234
- it 'should get the anchor of parent' do
235
- # Given
236
- input = '<h1 name="anchor">Foo</h1>
237
- <p>First paragraph</p>'
238
-
239
- # When
240
- actual = HTMLHierarchyExtractor.new(input).extract
241
-
242
- # Then
243
- expect(actual[0][:anchor]).to eq 'anchor'
244
- end
245
-
246
- it 'should get no anchor if none found' do
247
- # Given
248
- input = '<h1>Foo</h1>
249
- <p>First paragraph</p>'
250
-
251
- # When
252
- actual = HTMLHierarchyExtractor.new(input).extract
253
-
254
- # Then
255
- expect(actual[0][:anchor]).to eq nil
256
- end
257
-
258
- it 'should use the id as anchor if no name set' do
259
- # Given
260
- input = '<h1 id="anchor">Foo</h1>
261
- <p>First paragraph</p>'
262
-
263
- # When
264
- actual = HTMLHierarchyExtractor.new(input).extract
265
-
266
- # Then
267
- expect(actual[0][:anchor]).to eq 'anchor'
268
- end
269
-
270
- it 'should be set to nil if no name nor id' do
271
- # Given
272
- input = '<h1>Foo</h1>
273
- <p>First paragraph</p>'
274
-
275
- # When
276
- actual = HTMLHierarchyExtractor.new(input).extract
277
-
278
- # Then
279
- expect(actual[0][:anchor]).to eq nil
280
- end
281
-
282
- it 'should get the anchor of closest parent with an anchor' do
283
- # Given
284
- input = '<h1 name="anchor">Foo</h1>
285
- <p>First paragraph</p>
286
- <h2>Bar</h2>
287
- <p>Second paragraph</p>
288
- <h3 name="subanchor">Baz</h3>
289
- <p>Third paragraph</p>'
290
-
291
- # When
292
- actual = HTMLHierarchyExtractor.new(input).extract
293
-
294
- # Then
295
- expect(actual[0][:anchor]).to eq 'anchor'
296
- expect(actual[1][:anchor]).to eq 'anchor'
297
- expect(actual[2][:anchor]).to eq 'subanchor'
298
- end
299
-
300
- it 'should get anchor even if heading not a direct parent' do
301
- # Given
302
- input = '<header>
303
- <h1 name="anchor">Foo</h1>
304
- <p>First paragraph</p>
305
- </header>
306
- <div>
307
- <div>
308
- <div>
309
- <h2>Bar</h2>
310
- <p>Second paragraph</p>
311
- </div>
312
- </div>
313
- <div>
314
- <h3 name="subanchor">Baz</h3>
315
- <p>Third paragraph</p>
316
- </div>
317
- </div>'
318
-
319
- # When
320
- actual = HTMLHierarchyExtractor.new(input).extract
321
-
322
- # Then
323
- expect(actual[0][:anchor]).to eq 'anchor'
324
- expect(actual[1][:anchor]).to eq 'anchor'
325
- expect(actual[2][:anchor]).to eq 'subanchor'
326
- end
327
-
328
- it 'should get anchor if not directly on the header but inner element' do
329
- # Given
330
- input = '<h1><a name="anchor">Foo</a></h1>
331
- <p>First paragraph</p>'
332
-
333
- # When
334
- actual = HTMLHierarchyExtractor.new(input).extract
335
-
336
- # Then
337
- expect(actual[0][:anchor]).to eq 'anchor'
338
- end
339
- end
340
-
341
- describe 'uuid' do
342
- it 'should give different uuid if different content' do
343
- # Given
344
- input_a = '<p>foo</p>'
345
- input_b = '<p>bar</p>'
346
-
347
- # When
348
- actual_a = HTMLHierarchyExtractor.new(input_a).extract[0]
349
- actual_b = HTMLHierarchyExtractor.new(input_b).extract[0]
350
-
351
- # Then
352
- expect(actual_a[:uuid]).not_to eq(actual_b[:uuid])
353
- end
354
-
355
- it 'should give different uuid if different HTML tag' do
356
- # Given
357
- input_a = '<p>foo</p>'
358
- input_b = '<p class="bar">foo</p>'
359
-
360
- # When
361
- actual_a = HTMLHierarchyExtractor.new(input_a).extract[0]
362
- actual_b = HTMLHierarchyExtractor.new(input_b).extract[0]
363
-
364
- # Then
365
- expect(actual_a[:uuid]).not_to eq(actual_b[:uuid])
366
- end
367
-
368
- it 'should give different uuid if different position in page' do
369
- # Given
370
- input_a = '<p>foo</p><p>bar</p>'
371
- input_b = '<p>foo</p><p>foo again</p><p>bar</p>'
372
-
373
- # When
374
- actual_a = HTMLHierarchyExtractor.new(input_a).extract[1]
375
- actual_b = HTMLHierarchyExtractor.new(input_b).extract[2]
376
-
377
- # Then
378
- expect(actual_a[:uuid]).not_to eq(actual_b[:uuid])
379
- end
380
-
381
- it 'should give different uuid if different parent header' do
382
- # Given
383
- input_a = '<h1 name="foo">foo</h1><p>bar</p>'
384
- input_b = '<h1 name="bar">bar</h1><p>bar</p>'
385
-
386
- # When
387
- actual_a = HTMLHierarchyExtractor.new(input_a).extract[0]
388
- actual_b = HTMLHierarchyExtractor.new(input_b).extract[0]
389
-
390
- # Then
391
- expect(actual_a[:uuid]).not_to eq(actual_b[:uuid])
392
- end
393
-
394
- it 'should always give the same uuid for the same content' do
395
- # Given
396
- input_a = '<h1 name="foo">foo</h1><p>bar</p>'
397
- input_b = '<h1 name="foo">foo</h1><p>bar</p>'
398
-
399
- # When
400
- actual_a = HTMLHierarchyExtractor.new(input_a).extract[0]
401
- actual_b = HTMLHierarchyExtractor.new(input_b).extract[0]
402
-
403
- # Then
404
- expect(actual_a[:uuid]).to eq(actual_b[:uuid])
405
- end
406
- end
407
-
408
- describe 'heading_weight' do
409
- it 'should have 100 if no heading' do
410
- # Given
411
- input = '<p>foo</p>'
412
-
413
- # When
414
- actual = HTMLHierarchyExtractor.new(input).extract
415
-
416
- # Then
417
- expect(actual[0][:weight][:heading]).to eq 100
418
- end
419
-
420
- it 'should have decreasing value under small headers' do
421
- # Given
422
- input = '<h1 name="one">bar</h1><p>foo</p>
423
- <h2 name="two">bar</h2><p>foo</p>
424
- <h3 name="three">bar</h3><p>foo</p>
425
- <h4 name="four">bar</h4><p>foo</p>
426
- <h5 name="five">bar</h5><p>foo</p>
427
- <h6 name="six">bar</h6><p>foo</p>'
428
-
429
- # When
430
- actual = HTMLHierarchyExtractor.new(input).extract
431
-
432
- # Then
433
- expect(actual[0][:weight][:heading]).to eq 90
434
- expect(actual[1][:weight][:heading]).to eq 80
435
- expect(actual[2][:weight][:heading]).to eq 70
436
- expect(actual[3][:weight][:heading]).to eq 60
437
- expect(actual[4][:weight][:heading]).to eq 50
438
- expect(actual[5][:weight][:heading]).to eq 40
439
- end
440
- end
441
- end