html-hierarchy-extractor 1.0.2 → 1.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- metadata +45 -48
- data/.coveralls.yml +0 -1
- data/.document +0 -5
- data/.rspec +0 -2
- data/.rubocop.yml +0 -26
- data/.travis.yml +0 -12
- data/CONTRIBUTING.md +0 -53
- data/Gemfile +0 -16
- data/Guardfile +0 -7
- data/LICENSE.txt +0 -20
- data/README.md +0 -141
- data/Rakefile +0 -58
- data/VERSION +0 -1
- data/html-hierarchy-extractor.gemspec +0 -99
- data/lib/html-hierarchy-extractor.rb +0 -144
- data/lib/version.rb +0 -6
- data/scripts/bump_version +0 -47
- data/scripts/check_flay +0 -30
- data/scripts/check_flog +0 -31
- data/scripts/coverage +0 -3
- data/scripts/git_hooks/pre-commit +0 -16
- data/scripts/git_hooks/pre-push +0 -9
- data/scripts/lint +0 -2
- data/scripts/release +0 -13
- data/scripts/test +0 -4
- data/scripts/test_ci +0 -7
- data/scripts/watch +0 -4
- data/spec/html_hierarchy_extractor_spec.rb +0 -441
- data/spec/spec_helper.rb +0 -14
- data/spec/spec_helper_simplecov.rb +0 -9
data/scripts/git_hooks/pre-push
DELETED
data/scripts/lint
DELETED
data/scripts/release
DELETED
data/scripts/test
DELETED
data/scripts/test_ci
DELETED
@@ -1,7 +0,0 @@
|
|
1
|
-
#!/usr/bin/env bash
|
2
|
-
# This script will be started by Travis, in the correct context (matrix of Ruby
|
3
|
-
# version + Gemfile version), so it only needs to load the tests, without
|
4
|
-
# worrying about appraisal
|
5
|
-
cd "$(dirname "$BASH_SOURCE")"/..
|
6
|
-
|
7
|
-
COVERAGE=1 bundle exec rspec
|
data/scripts/watch
DELETED
@@ -1,441 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe(HTMLHierarchyExtractor) do
|
4
|
-
describe 'extract' do
|
5
|
-
it 'should load from an HTML string' do
|
6
|
-
# Given
|
7
|
-
input = '<p>foo</p>'
|
8
|
-
|
9
|
-
# When
|
10
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
11
|
-
|
12
|
-
# Then
|
13
|
-
expect(actual.size).to eq 1
|
14
|
-
end
|
15
|
-
|
16
|
-
it 'should allow overriding of the default css selector of nodes' do
|
17
|
-
# Given
|
18
|
-
input = '<div>foo</div>'
|
19
|
-
|
20
|
-
# When
|
21
|
-
options = {
|
22
|
-
css_selector: 'div'
|
23
|
-
}
|
24
|
-
actual = HTMLHierarchyExtractor.new(input, options: options).extract
|
25
|
-
|
26
|
-
# Then
|
27
|
-
expect(actual.size).to eq 1
|
28
|
-
end
|
29
|
-
|
30
|
-
it 'should export the Nokogiri node' do
|
31
|
-
# Given
|
32
|
-
input = '<p>foo</p>'
|
33
|
-
|
34
|
-
# When
|
35
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
36
|
-
|
37
|
-
# Then
|
38
|
-
expect(actual[0][:node]).to be_an(Nokogiri::XML::Element)
|
39
|
-
end
|
40
|
-
|
41
|
-
it 'should remove empty elements' do
|
42
|
-
# Given
|
43
|
-
input = '<p></p>'
|
44
|
-
|
45
|
-
# When
|
46
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
47
|
-
|
48
|
-
# Then
|
49
|
-
expect(actual.size).to eq 0
|
50
|
-
end
|
51
|
-
|
52
|
-
it 'should add the DOM position to each element' do
|
53
|
-
# Given
|
54
|
-
input = '<p>foo</p>
|
55
|
-
<p>bar</p>
|
56
|
-
<p>baz</p>'
|
57
|
-
|
58
|
-
# When
|
59
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
60
|
-
|
61
|
-
# Then
|
62
|
-
expect(actual[0][:weight][:position]).to eq 0
|
63
|
-
expect(actual[1][:weight][:position]).to eq 1
|
64
|
-
expect(actual[2][:weight][:position]).to eq 2
|
65
|
-
end
|
66
|
-
end
|
67
|
-
|
68
|
-
describe 'extract_html' do
|
69
|
-
it 'should extract outer html' do
|
70
|
-
# Given
|
71
|
-
input = '<p>foo</p>'
|
72
|
-
|
73
|
-
# When
|
74
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
75
|
-
|
76
|
-
# Then
|
77
|
-
expect(actual[0][:html]).to eq '<p>foo</p>'
|
78
|
-
end
|
79
|
-
|
80
|
-
it 'should trim content' do
|
81
|
-
# Given
|
82
|
-
input = '<p>foo</p>
|
83
|
-
<blink>irrelevant</blink>'
|
84
|
-
|
85
|
-
# When
|
86
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
87
|
-
|
88
|
-
# Then
|
89
|
-
expect(actual[0][:html]).to eq '<p>foo</p>'
|
90
|
-
end
|
91
|
-
end
|
92
|
-
|
93
|
-
describe 'extract_text' do
|
94
|
-
it 'should extract inner text' do
|
95
|
-
# Given
|
96
|
-
input = '<p>foo</p>'
|
97
|
-
|
98
|
-
# When
|
99
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
100
|
-
|
101
|
-
# Then
|
102
|
-
expect(actual[0][:text]).to eq 'foo'
|
103
|
-
end
|
104
|
-
|
105
|
-
it 'should extract UTF8 correctly' do
|
106
|
-
# Given
|
107
|
-
input = '<p>UTF8‽✗✓</p>'
|
108
|
-
|
109
|
-
# When
|
110
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
111
|
-
|
112
|
-
# Then
|
113
|
-
expect(actual[0][:text]).to eq 'UTF8‽✗✓'
|
114
|
-
end
|
115
|
-
end
|
116
|
-
|
117
|
-
describe 'extract_tag_name' do
|
118
|
-
it 'should extract the tag name' do
|
119
|
-
# Given
|
120
|
-
input = '<p>foo</p>'
|
121
|
-
|
122
|
-
# When
|
123
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
124
|
-
|
125
|
-
# Then
|
126
|
-
expect(actual[0][:tag_name]).to eq 'p'
|
127
|
-
end
|
128
|
-
|
129
|
-
it 'should always return lowercase' do
|
130
|
-
# Given
|
131
|
-
input = '<P>foo</P>'
|
132
|
-
|
133
|
-
# When
|
134
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
135
|
-
|
136
|
-
# Then
|
137
|
-
expect(actual[0][:tag_name]).to eq 'p'
|
138
|
-
end
|
139
|
-
end
|
140
|
-
|
141
|
-
describe 'extract_hierarchy' do
|
142
|
-
it 'should extract a simple hierarchy' do
|
143
|
-
# Given
|
144
|
-
input = '<h1>Foo</h1>
|
145
|
-
<p>First paragraph</p>
|
146
|
-
<h2>Bar</h2>
|
147
|
-
<p>Second paragraph</p>
|
148
|
-
<h3>Baz</h3>
|
149
|
-
<p>Third paragraph</p>'
|
150
|
-
|
151
|
-
# When
|
152
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
153
|
-
|
154
|
-
# Then
|
155
|
-
expect(actual[0][:hierarchy][:lvl0]).to eq 'Foo'
|
156
|
-
expect(actual[0][:hierarchy][:lvl1]).to eq nil
|
157
|
-
expect(actual[0][:hierarchy][:lvl2]).to eq nil
|
158
|
-
|
159
|
-
expect(actual[1][:hierarchy][:lvl0]).to eq 'Foo'
|
160
|
-
expect(actual[1][:hierarchy][:lvl1]).to eq 'Bar'
|
161
|
-
expect(actual[1][:hierarchy][:lvl2]).to eq nil
|
162
|
-
|
163
|
-
expect(actual[2][:hierarchy][:lvl0]).to eq 'Foo'
|
164
|
-
expect(actual[2][:hierarchy][:lvl1]).to eq 'Bar'
|
165
|
-
expect(actual[2][:hierarchy][:lvl2]).to eq 'Baz'
|
166
|
-
end
|
167
|
-
|
168
|
-
it 'should use inner text of headings' do
|
169
|
-
# Given
|
170
|
-
input = '<h1><a href="#">Foo</a><span></span></h1>
|
171
|
-
<p>First paragraph</p>'
|
172
|
-
|
173
|
-
# When
|
174
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
175
|
-
|
176
|
-
# Then
|
177
|
-
expect(actual[0][:hierarchy][:lvl0]).to eq 'Foo'
|
178
|
-
expect(actual[0][:hierarchy][:lvl1]).to eq nil
|
179
|
-
expect(actual[0][:hierarchy][:lvl2]).to eq nil
|
180
|
-
end
|
181
|
-
|
182
|
-
it 'should handle nodes not in any hierarchy' do
|
183
|
-
# Given
|
184
|
-
input = '<p>First paragraph</p>
|
185
|
-
<h1>Foo</h1>'
|
186
|
-
|
187
|
-
# When
|
188
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
189
|
-
|
190
|
-
# Then
|
191
|
-
expect(actual[0][:hierarchy][:lvl0]).to eq nil
|
192
|
-
expect(actual[0][:hierarchy][:lvl1]).to eq nil
|
193
|
-
expect(actual[0][:hierarchy][:lvl2]).to eq nil
|
194
|
-
end
|
195
|
-
|
196
|
-
it 'should handle any number of wrappers' do
|
197
|
-
# Given
|
198
|
-
input = '<header>
|
199
|
-
<h1>Foo</h1>
|
200
|
-
<p>First paragraph</p>
|
201
|
-
</header>
|
202
|
-
<div>
|
203
|
-
<div>
|
204
|
-
<div>
|
205
|
-
<h2>Bar</h2>
|
206
|
-
<p>Second paragraph</p>
|
207
|
-
</div>
|
208
|
-
</div>
|
209
|
-
<div>
|
210
|
-
<h3>Baz</h3>
|
211
|
-
<p>Third paragraph</p>
|
212
|
-
</div>
|
213
|
-
</div>'
|
214
|
-
|
215
|
-
# When
|
216
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
217
|
-
|
218
|
-
# Then
|
219
|
-
expect(actual[0][:hierarchy][:lvl0]).to eq 'Foo'
|
220
|
-
expect(actual[0][:hierarchy][:lvl1]).to eq nil
|
221
|
-
expect(actual[0][:hierarchy][:lvl2]).to eq nil
|
222
|
-
|
223
|
-
expect(actual[1][:hierarchy][:lvl0]).to eq 'Foo'
|
224
|
-
expect(actual[1][:hierarchy][:lvl1]).to eq 'Bar'
|
225
|
-
expect(actual[1][:hierarchy][:lvl2]).to eq nil
|
226
|
-
|
227
|
-
expect(actual[2][:hierarchy][:lvl0]).to eq 'Foo'
|
228
|
-
expect(actual[2][:hierarchy][:lvl1]).to eq 'Bar'
|
229
|
-
expect(actual[2][:hierarchy][:lvl2]).to eq 'Baz'
|
230
|
-
end
|
231
|
-
end
|
232
|
-
|
233
|
-
describe 'extract_anchor' do
|
234
|
-
it 'should get the anchor of parent' do
|
235
|
-
# Given
|
236
|
-
input = '<h1 name="anchor">Foo</h1>
|
237
|
-
<p>First paragraph</p>'
|
238
|
-
|
239
|
-
# When
|
240
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
241
|
-
|
242
|
-
# Then
|
243
|
-
expect(actual[0][:anchor]).to eq 'anchor'
|
244
|
-
end
|
245
|
-
|
246
|
-
it 'should get no anchor if none found' do
|
247
|
-
# Given
|
248
|
-
input = '<h1>Foo</h1>
|
249
|
-
<p>First paragraph</p>'
|
250
|
-
|
251
|
-
# When
|
252
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
253
|
-
|
254
|
-
# Then
|
255
|
-
expect(actual[0][:anchor]).to eq nil
|
256
|
-
end
|
257
|
-
|
258
|
-
it 'should use the id as anchor if no name set' do
|
259
|
-
# Given
|
260
|
-
input = '<h1 id="anchor">Foo</h1>
|
261
|
-
<p>First paragraph</p>'
|
262
|
-
|
263
|
-
# When
|
264
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
265
|
-
|
266
|
-
# Then
|
267
|
-
expect(actual[0][:anchor]).to eq 'anchor'
|
268
|
-
end
|
269
|
-
|
270
|
-
it 'should be set to nil if no name nor id' do
|
271
|
-
# Given
|
272
|
-
input = '<h1>Foo</h1>
|
273
|
-
<p>First paragraph</p>'
|
274
|
-
|
275
|
-
# When
|
276
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
277
|
-
|
278
|
-
# Then
|
279
|
-
expect(actual[0][:anchor]).to eq nil
|
280
|
-
end
|
281
|
-
|
282
|
-
it 'should get the anchor of closest parent with an anchor' do
|
283
|
-
# Given
|
284
|
-
input = '<h1 name="anchor">Foo</h1>
|
285
|
-
<p>First paragraph</p>
|
286
|
-
<h2>Bar</h2>
|
287
|
-
<p>Second paragraph</p>
|
288
|
-
<h3 name="subanchor">Baz</h3>
|
289
|
-
<p>Third paragraph</p>'
|
290
|
-
|
291
|
-
# When
|
292
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
293
|
-
|
294
|
-
# Then
|
295
|
-
expect(actual[0][:anchor]).to eq 'anchor'
|
296
|
-
expect(actual[1][:anchor]).to eq 'anchor'
|
297
|
-
expect(actual[2][:anchor]).to eq 'subanchor'
|
298
|
-
end
|
299
|
-
|
300
|
-
it 'should get anchor even if heading not a direct parent' do
|
301
|
-
# Given
|
302
|
-
input = '<header>
|
303
|
-
<h1 name="anchor">Foo</h1>
|
304
|
-
<p>First paragraph</p>
|
305
|
-
</header>
|
306
|
-
<div>
|
307
|
-
<div>
|
308
|
-
<div>
|
309
|
-
<h2>Bar</h2>
|
310
|
-
<p>Second paragraph</p>
|
311
|
-
</div>
|
312
|
-
</div>
|
313
|
-
<div>
|
314
|
-
<h3 name="subanchor">Baz</h3>
|
315
|
-
<p>Third paragraph</p>
|
316
|
-
</div>
|
317
|
-
</div>'
|
318
|
-
|
319
|
-
# When
|
320
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
321
|
-
|
322
|
-
# Then
|
323
|
-
expect(actual[0][:anchor]).to eq 'anchor'
|
324
|
-
expect(actual[1][:anchor]).to eq 'anchor'
|
325
|
-
expect(actual[2][:anchor]).to eq 'subanchor'
|
326
|
-
end
|
327
|
-
|
328
|
-
it 'should get anchor if not directly on the header but inner element' do
|
329
|
-
# Given
|
330
|
-
input = '<h1><a name="anchor">Foo</a></h1>
|
331
|
-
<p>First paragraph</p>'
|
332
|
-
|
333
|
-
# When
|
334
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
335
|
-
|
336
|
-
# Then
|
337
|
-
expect(actual[0][:anchor]).to eq 'anchor'
|
338
|
-
end
|
339
|
-
end
|
340
|
-
|
341
|
-
describe 'uuid' do
|
342
|
-
it 'should give different uuid if different content' do
|
343
|
-
# Given
|
344
|
-
input_a = '<p>foo</p>'
|
345
|
-
input_b = '<p>bar</p>'
|
346
|
-
|
347
|
-
# When
|
348
|
-
actual_a = HTMLHierarchyExtractor.new(input_a).extract[0]
|
349
|
-
actual_b = HTMLHierarchyExtractor.new(input_b).extract[0]
|
350
|
-
|
351
|
-
# Then
|
352
|
-
expect(actual_a[:uuid]).not_to eq(actual_b[:uuid])
|
353
|
-
end
|
354
|
-
|
355
|
-
it 'should give different uuid if different HTML tag' do
|
356
|
-
# Given
|
357
|
-
input_a = '<p>foo</p>'
|
358
|
-
input_b = '<p class="bar">foo</p>'
|
359
|
-
|
360
|
-
# When
|
361
|
-
actual_a = HTMLHierarchyExtractor.new(input_a).extract[0]
|
362
|
-
actual_b = HTMLHierarchyExtractor.new(input_b).extract[0]
|
363
|
-
|
364
|
-
# Then
|
365
|
-
expect(actual_a[:uuid]).not_to eq(actual_b[:uuid])
|
366
|
-
end
|
367
|
-
|
368
|
-
it 'should give different uuid if different position in page' do
|
369
|
-
# Given
|
370
|
-
input_a = '<p>foo</p><p>bar</p>'
|
371
|
-
input_b = '<p>foo</p><p>foo again</p><p>bar</p>'
|
372
|
-
|
373
|
-
# When
|
374
|
-
actual_a = HTMLHierarchyExtractor.new(input_a).extract[1]
|
375
|
-
actual_b = HTMLHierarchyExtractor.new(input_b).extract[2]
|
376
|
-
|
377
|
-
# Then
|
378
|
-
expect(actual_a[:uuid]).not_to eq(actual_b[:uuid])
|
379
|
-
end
|
380
|
-
|
381
|
-
it 'should give different uuid if different parent header' do
|
382
|
-
# Given
|
383
|
-
input_a = '<h1 name="foo">foo</h1><p>bar</p>'
|
384
|
-
input_b = '<h1 name="bar">bar</h1><p>bar</p>'
|
385
|
-
|
386
|
-
# When
|
387
|
-
actual_a = HTMLHierarchyExtractor.new(input_a).extract[0]
|
388
|
-
actual_b = HTMLHierarchyExtractor.new(input_b).extract[0]
|
389
|
-
|
390
|
-
# Then
|
391
|
-
expect(actual_a[:uuid]).not_to eq(actual_b[:uuid])
|
392
|
-
end
|
393
|
-
|
394
|
-
it 'should always give the same uuid for the same content' do
|
395
|
-
# Given
|
396
|
-
input_a = '<h1 name="foo">foo</h1><p>bar</p>'
|
397
|
-
input_b = '<h1 name="foo">foo</h1><p>bar</p>'
|
398
|
-
|
399
|
-
# When
|
400
|
-
actual_a = HTMLHierarchyExtractor.new(input_a).extract[0]
|
401
|
-
actual_b = HTMLHierarchyExtractor.new(input_b).extract[0]
|
402
|
-
|
403
|
-
# Then
|
404
|
-
expect(actual_a[:uuid]).to eq(actual_b[:uuid])
|
405
|
-
end
|
406
|
-
end
|
407
|
-
|
408
|
-
describe 'heading_weight' do
|
409
|
-
it 'should have 100 if no heading' do
|
410
|
-
# Given
|
411
|
-
input = '<p>foo</p>'
|
412
|
-
|
413
|
-
# When
|
414
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
415
|
-
|
416
|
-
# Then
|
417
|
-
expect(actual[0][:weight][:heading]).to eq 100
|
418
|
-
end
|
419
|
-
|
420
|
-
it 'should have decreasing value under small headers' do
|
421
|
-
# Given
|
422
|
-
input = '<h1 name="one">bar</h1><p>foo</p>
|
423
|
-
<h2 name="two">bar</h2><p>foo</p>
|
424
|
-
<h3 name="three">bar</h3><p>foo</p>
|
425
|
-
<h4 name="four">bar</h4><p>foo</p>
|
426
|
-
<h5 name="five">bar</h5><p>foo</p>
|
427
|
-
<h6 name="six">bar</h6><p>foo</p>'
|
428
|
-
|
429
|
-
# When
|
430
|
-
actual = HTMLHierarchyExtractor.new(input).extract
|
431
|
-
|
432
|
-
# Then
|
433
|
-
expect(actual[0][:weight][:heading]).to eq 90
|
434
|
-
expect(actual[1][:weight][:heading]).to eq 80
|
435
|
-
expect(actual[2][:weight][:heading]).to eq 70
|
436
|
-
expect(actual[3][:weight][:heading]).to eq 60
|
437
|
-
expect(actual[4][:weight][:heading]).to eq 50
|
438
|
-
expect(actual[5][:weight][:heading]).to eq 40
|
439
|
-
end
|
440
|
-
end
|
441
|
-
end
|