html-hierarchy-extractor 1.0.2 → 1.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- ./scripts/test || exit 1
4
-
5
- # No over-complex methods
6
- ./scripts/check_flog || exit 1
7
-
8
- # No duplication
9
- ./scripts/check_flay
@@ -1,2 +0,0 @@
1
- #!/usr/bin/env bash
2
- rubocop -F './lib/' './spec'
@@ -1,13 +0,0 @@
1
- #!/usr/bin/env bash
2
- # Stop if any command fails
3
- set -e
4
-
5
- git checkout master
6
- git pull
7
-
8
- git rebase develop
9
- bundle install
10
- rake release
11
-
12
- git checkout develop
13
- git rebase master
@@ -1,4 +0,0 @@
1
- #!/usr/bin/env bash
2
- cd "$(dirname "$BASH_SOURCE")"/..
3
-
4
- COVERAGE=1 bundle exec rspec
@@ -1,7 +0,0 @@
1
- #!/usr/bin/env bash
2
- # This script will be started by Travis, in the correct context (matrix of Ruby
3
- # version + Gemfile version), so it only needs to load the tests, without
4
- # worrying about appraisal
5
- cd "$(dirname "$BASH_SOURCE")"/..
6
-
7
- COVERAGE=1 bundle exec rspec
@@ -1,4 +0,0 @@
1
- #!/usr/bin/env bash
2
- cd "$(dirname "$BASH_SOURCE")"/..
3
-
4
- guard
@@ -1,441 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe(HTMLHierarchyExtractor) do
4
- describe 'extract' do
5
- it 'should load from an HTML string' do
6
- # Given
7
- input = '<p>foo</p>'
8
-
9
- # When
10
- actual = HTMLHierarchyExtractor.new(input).extract
11
-
12
- # Then
13
- expect(actual.size).to eq 1
14
- end
15
-
16
- it 'should allow overriding of the default css selector of nodes' do
17
- # Given
18
- input = '<div>foo</div>'
19
-
20
- # When
21
- options = {
22
- css_selector: 'div'
23
- }
24
- actual = HTMLHierarchyExtractor.new(input, options: options).extract
25
-
26
- # Then
27
- expect(actual.size).to eq 1
28
- end
29
-
30
- it 'should export the Nokogiri node' do
31
- # Given
32
- input = '<p>foo</p>'
33
-
34
- # When
35
- actual = HTMLHierarchyExtractor.new(input).extract
36
-
37
- # Then
38
- expect(actual[0][:node]).to be_an(Nokogiri::XML::Element)
39
- end
40
-
41
- it 'should remove empty elements' do
42
- # Given
43
- input = '<p></p>'
44
-
45
- # When
46
- actual = HTMLHierarchyExtractor.new(input).extract
47
-
48
- # Then
49
- expect(actual.size).to eq 0
50
- end
51
-
52
- it 'should add the DOM position to each element' do
53
- # Given
54
- input = '<p>foo</p>
55
- <p>bar</p>
56
- <p>baz</p>'
57
-
58
- # When
59
- actual = HTMLHierarchyExtractor.new(input).extract
60
-
61
- # Then
62
- expect(actual[0][:weight][:position]).to eq 0
63
- expect(actual[1][:weight][:position]).to eq 1
64
- expect(actual[2][:weight][:position]).to eq 2
65
- end
66
- end
67
-
68
- describe 'extract_html' do
69
- it 'should extract outer html' do
70
- # Given
71
- input = '<p>foo</p>'
72
-
73
- # When
74
- actual = HTMLHierarchyExtractor.new(input).extract
75
-
76
- # Then
77
- expect(actual[0][:html]).to eq '<p>foo</p>'
78
- end
79
-
80
- it 'should trim content' do
81
- # Given
82
- input = '<p>foo</p>
83
- <blink>irrelevant</blink>'
84
-
85
- # When
86
- actual = HTMLHierarchyExtractor.new(input).extract
87
-
88
- # Then
89
- expect(actual[0][:html]).to eq '<p>foo</p>'
90
- end
91
- end
92
-
93
- describe 'extract_text' do
94
- it 'should extract inner text' do
95
- # Given
96
- input = '<p>foo</p>'
97
-
98
- # When
99
- actual = HTMLHierarchyExtractor.new(input).extract
100
-
101
- # Then
102
- expect(actual[0][:text]).to eq 'foo'
103
- end
104
-
105
- it 'should extract UTF8 correctly' do
106
- # Given
107
- input = '<p>UTF8‽✗✓</p>'
108
-
109
- # When
110
- actual = HTMLHierarchyExtractor.new(input).extract
111
-
112
- # Then
113
- expect(actual[0][:text]).to eq 'UTF8‽✗✓'
114
- end
115
- end
116
-
117
- describe 'extract_tag_name' do
118
- it 'should extract the tag name' do
119
- # Given
120
- input = '<p>foo</p>'
121
-
122
- # When
123
- actual = HTMLHierarchyExtractor.new(input).extract
124
-
125
- # Then
126
- expect(actual[0][:tag_name]).to eq 'p'
127
- end
128
-
129
- it 'should always return lowercase' do
130
- # Given
131
- input = '<P>foo</P>'
132
-
133
- # When
134
- actual = HTMLHierarchyExtractor.new(input).extract
135
-
136
- # Then
137
- expect(actual[0][:tag_name]).to eq 'p'
138
- end
139
- end
140
-
141
- describe 'extract_hierarchy' do
142
- it 'should extract a simple hierarchy' do
143
- # Given
144
- input = '<h1>Foo</h1>
145
- <p>First paragraph</p>
146
- <h2>Bar</h2>
147
- <p>Second paragraph</p>
148
- <h3>Baz</h3>
149
- <p>Third paragraph</p>'
150
-
151
- # When
152
- actual = HTMLHierarchyExtractor.new(input).extract
153
-
154
- # Then
155
- expect(actual[0][:hierarchy][:lvl0]).to eq 'Foo'
156
- expect(actual[0][:hierarchy][:lvl1]).to eq nil
157
- expect(actual[0][:hierarchy][:lvl2]).to eq nil
158
-
159
- expect(actual[1][:hierarchy][:lvl0]).to eq 'Foo'
160
- expect(actual[1][:hierarchy][:lvl1]).to eq 'Bar'
161
- expect(actual[1][:hierarchy][:lvl2]).to eq nil
162
-
163
- expect(actual[2][:hierarchy][:lvl0]).to eq 'Foo'
164
- expect(actual[2][:hierarchy][:lvl1]).to eq 'Bar'
165
- expect(actual[2][:hierarchy][:lvl2]).to eq 'Baz'
166
- end
167
-
168
- it 'should use inner text of headings' do
169
- # Given
170
- input = '<h1><a href="#">Foo</a><span></span></h1>
171
- <p>First paragraph</p>'
172
-
173
- # When
174
- actual = HTMLHierarchyExtractor.new(input).extract
175
-
176
- # Then
177
- expect(actual[0][:hierarchy][:lvl0]).to eq 'Foo'
178
- expect(actual[0][:hierarchy][:lvl1]).to eq nil
179
- expect(actual[0][:hierarchy][:lvl2]).to eq nil
180
- end
181
-
182
- it 'should handle nodes not in any hierarchy' do
183
- # Given
184
- input = '<p>First paragraph</p>
185
- <h1>Foo</h1>'
186
-
187
- # When
188
- actual = HTMLHierarchyExtractor.new(input).extract
189
-
190
- # Then
191
- expect(actual[0][:hierarchy][:lvl0]).to eq nil
192
- expect(actual[0][:hierarchy][:lvl1]).to eq nil
193
- expect(actual[0][:hierarchy][:lvl2]).to eq nil
194
- end
195
-
196
- it 'should handle any number of wrappers' do
197
- # Given
198
- input = '<header>
199
- <h1>Foo</h1>
200
- <p>First paragraph</p>
201
- </header>
202
- <div>
203
- <div>
204
- <div>
205
- <h2>Bar</h2>
206
- <p>Second paragraph</p>
207
- </div>
208
- </div>
209
- <div>
210
- <h3>Baz</h3>
211
- <p>Third paragraph</p>
212
- </div>
213
- </div>'
214
-
215
- # When
216
- actual = HTMLHierarchyExtractor.new(input).extract
217
-
218
- # Then
219
- expect(actual[0][:hierarchy][:lvl0]).to eq 'Foo'
220
- expect(actual[0][:hierarchy][:lvl1]).to eq nil
221
- expect(actual[0][:hierarchy][:lvl2]).to eq nil
222
-
223
- expect(actual[1][:hierarchy][:lvl0]).to eq 'Foo'
224
- expect(actual[1][:hierarchy][:lvl1]).to eq 'Bar'
225
- expect(actual[1][:hierarchy][:lvl2]).to eq nil
226
-
227
- expect(actual[2][:hierarchy][:lvl0]).to eq 'Foo'
228
- expect(actual[2][:hierarchy][:lvl1]).to eq 'Bar'
229
- expect(actual[2][:hierarchy][:lvl2]).to eq 'Baz'
230
- end
231
- end
232
-
233
- describe 'extract_anchor' do
234
- it 'should get the anchor of parent' do
235
- # Given
236
- input = '<h1 name="anchor">Foo</h1>
237
- <p>First paragraph</p>'
238
-
239
- # When
240
- actual = HTMLHierarchyExtractor.new(input).extract
241
-
242
- # Then
243
- expect(actual[0][:anchor]).to eq 'anchor'
244
- end
245
-
246
- it 'should get no anchor if none found' do
247
- # Given
248
- input = '<h1>Foo</h1>
249
- <p>First paragraph</p>'
250
-
251
- # When
252
- actual = HTMLHierarchyExtractor.new(input).extract
253
-
254
- # Then
255
- expect(actual[0][:anchor]).to eq nil
256
- end
257
-
258
- it 'should use the id as anchor if no name set' do
259
- # Given
260
- input = '<h1 id="anchor">Foo</h1>
261
- <p>First paragraph</p>'
262
-
263
- # When
264
- actual = HTMLHierarchyExtractor.new(input).extract
265
-
266
- # Then
267
- expect(actual[0][:anchor]).to eq 'anchor'
268
- end
269
-
270
- it 'should be set to nil if no name nor id' do
271
- # Given
272
- input = '<h1>Foo</h1>
273
- <p>First paragraph</p>'
274
-
275
- # When
276
- actual = HTMLHierarchyExtractor.new(input).extract
277
-
278
- # Then
279
- expect(actual[0][:anchor]).to eq nil
280
- end
281
-
282
- it 'should get the anchor of closest parent with an anchor' do
283
- # Given
284
- input = '<h1 name="anchor">Foo</h1>
285
- <p>First paragraph</p>
286
- <h2>Bar</h2>
287
- <p>Second paragraph</p>
288
- <h3 name="subanchor">Baz</h3>
289
- <p>Third paragraph</p>'
290
-
291
- # When
292
- actual = HTMLHierarchyExtractor.new(input).extract
293
-
294
- # Then
295
- expect(actual[0][:anchor]).to eq 'anchor'
296
- expect(actual[1][:anchor]).to eq 'anchor'
297
- expect(actual[2][:anchor]).to eq 'subanchor'
298
- end
299
-
300
- it 'should get anchor even if heading not a direct parent' do
301
- # Given
302
- input = '<header>
303
- <h1 name="anchor">Foo</h1>
304
- <p>First paragraph</p>
305
- </header>
306
- <div>
307
- <div>
308
- <div>
309
- <h2>Bar</h2>
310
- <p>Second paragraph</p>
311
- </div>
312
- </div>
313
- <div>
314
- <h3 name="subanchor">Baz</h3>
315
- <p>Third paragraph</p>
316
- </div>
317
- </div>'
318
-
319
- # When
320
- actual = HTMLHierarchyExtractor.new(input).extract
321
-
322
- # Then
323
- expect(actual[0][:anchor]).to eq 'anchor'
324
- expect(actual[1][:anchor]).to eq 'anchor'
325
- expect(actual[2][:anchor]).to eq 'subanchor'
326
- end
327
-
328
- it 'should get anchor if not directly on the header but inner element' do
329
- # Given
330
- input = '<h1><a name="anchor">Foo</a></h1>
331
- <p>First paragraph</p>'
332
-
333
- # When
334
- actual = HTMLHierarchyExtractor.new(input).extract
335
-
336
- # Then
337
- expect(actual[0][:anchor]).to eq 'anchor'
338
- end
339
- end
340
-
341
- describe 'uuid' do
342
- it 'should give different uuid if different content' do
343
- # Given
344
- input_a = '<p>foo</p>'
345
- input_b = '<p>bar</p>'
346
-
347
- # When
348
- actual_a = HTMLHierarchyExtractor.new(input_a).extract[0]
349
- actual_b = HTMLHierarchyExtractor.new(input_b).extract[0]
350
-
351
- # Then
352
- expect(actual_a[:uuid]).not_to eq(actual_b[:uuid])
353
- end
354
-
355
- it 'should give different uuid if different HTML tag' do
356
- # Given
357
- input_a = '<p>foo</p>'
358
- input_b = '<p class="bar">foo</p>'
359
-
360
- # When
361
- actual_a = HTMLHierarchyExtractor.new(input_a).extract[0]
362
- actual_b = HTMLHierarchyExtractor.new(input_b).extract[0]
363
-
364
- # Then
365
- expect(actual_a[:uuid]).not_to eq(actual_b[:uuid])
366
- end
367
-
368
- it 'should give different uuid if different position in page' do
369
- # Given
370
- input_a = '<p>foo</p><p>bar</p>'
371
- input_b = '<p>foo</p><p>foo again</p><p>bar</p>'
372
-
373
- # When
374
- actual_a = HTMLHierarchyExtractor.new(input_a).extract[1]
375
- actual_b = HTMLHierarchyExtractor.new(input_b).extract[2]
376
-
377
- # Then
378
- expect(actual_a[:uuid]).not_to eq(actual_b[:uuid])
379
- end
380
-
381
- it 'should give different uuid if different parent header' do
382
- # Given
383
- input_a = '<h1 name="foo">foo</h1><p>bar</p>'
384
- input_b = '<h1 name="bar">bar</h1><p>bar</p>'
385
-
386
- # When
387
- actual_a = HTMLHierarchyExtractor.new(input_a).extract[0]
388
- actual_b = HTMLHierarchyExtractor.new(input_b).extract[0]
389
-
390
- # Then
391
- expect(actual_a[:uuid]).not_to eq(actual_b[:uuid])
392
- end
393
-
394
- it 'should always give the same uuid for the same content' do
395
- # Given
396
- input_a = '<h1 name="foo">foo</h1><p>bar</p>'
397
- input_b = '<h1 name="foo">foo</h1><p>bar</p>'
398
-
399
- # When
400
- actual_a = HTMLHierarchyExtractor.new(input_a).extract[0]
401
- actual_b = HTMLHierarchyExtractor.new(input_b).extract[0]
402
-
403
- # Then
404
- expect(actual_a[:uuid]).to eq(actual_b[:uuid])
405
- end
406
- end
407
-
408
- describe 'heading_weight' do
409
- it 'should have 100 if no heading' do
410
- # Given
411
- input = '<p>foo</p>'
412
-
413
- # When
414
- actual = HTMLHierarchyExtractor.new(input).extract
415
-
416
- # Then
417
- expect(actual[0][:weight][:heading]).to eq 100
418
- end
419
-
420
- it 'should have decreasing value under small headers' do
421
- # Given
422
- input = '<h1 name="one">bar</h1><p>foo</p>
423
- <h2 name="two">bar</h2><p>foo</p>
424
- <h3 name="three">bar</h3><p>foo</p>
425
- <h4 name="four">bar</h4><p>foo</p>
426
- <h5 name="five">bar</h5><p>foo</p>
427
- <h6 name="six">bar</h6><p>foo</p>'
428
-
429
- # When
430
- actual = HTMLHierarchyExtractor.new(input).extract
431
-
432
- # Then
433
- expect(actual[0][:weight][:heading]).to eq 90
434
- expect(actual[1][:weight][:heading]).to eq 80
435
- expect(actual[2][:weight][:heading]).to eq 70
436
- expect(actual[3][:weight][:heading]).to eq 60
437
- expect(actual[4][:weight][:heading]).to eq 50
438
- expect(actual[5][:weight][:heading]).to eq 40
439
- end
440
- end
441
- end