html-hierarchy-extractor 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/version.rb ADDED
@@ -0,0 +1,6 @@
1
# Holds the gem version and exposes it through a class-level `to_s`
class HTMLHierarchyExtractorVersion
  class << self
    # Returns the version as a 'major.minor.patch' string
    def to_s
      '1.0.0'
    end
  end
end
@@ -0,0 +1,47 @@
1
+ #!/usr/bin/env ruby
2
+ require_relative '../lib/version.rb'
3
+
4
# Simple script used to bump the version number.
#
# Reads the current version from HTMLHierarchyExtractorVersion, rewrites
# lib/version.rb with the bumped version, then stages and commits that file
# with git.
class BumpVersion
  # Bump types accepted as first command-line argument
  VALID_TYPES = %w(major minor patch).freeze

  # args - the command-line arguments; the first one is the bump type.
  #
  # Prints a message and exits with status 1 when the type is not valid.
  def initialize(*args)
    @type = args[0]
    return if valid_type?(@type)

    puts "Invalid bump type: #{@type}"
    exit 1
  end

  # Returns true when type is one of 'major', 'minor' or 'patch'.
  def valid_type?(type)
    VALID_TYPES.include?(type)
  end

  # Computes the version that follows current_version.
  #
  # current_version - a dot-separated version string (e.g. '1.2.3')
  # type            - 'major', 'minor' or 'patch'; any other value leaves the
  #                   version unchanged
  #
  # Returns the new version as a 'major.minor.patch' string.
  def bump(current_version, type)
    major, minor, patch = current_version.split('.').map(&:to_i)
    # Missing components (e.g. '1.2') count as 0 instead of raising on nil
    major ||= 0
    minor ||= 0
    patch ||= 0

    case type
    when 'major' then "#{major + 1}.0.0"
    when 'minor' then "#{major}.#{minor + 1}.0"
    when 'patch' then "#{major}.#{minor}.#{patch + 1}"
    else "#{major}.#{minor}.#{patch}"
    end
  end

  # Writes the bumped version into lib/version.rb and commits the change.
  def run
    old_version = HTMLHierarchyExtractorVersion.to_s
    new_version = bump(old_version, @type)

    file = File.expand_path('../lib/version.rb', File.dirname(__FILE__))
    File.write(file, File.read(file).gsub(old_version, new_version))

    # Arguments are passed as a list so that no shell is involved: a path
    # containing spaces or shell metacharacters is handed to git untouched.
    # Nothing is committed when staging the file failed.
    return unless system('git', 'add', file)

    system('git', 'commit', '-m',
           "chore(bump): Version bump to #{new_version}")
  end
end
47
+ BumpVersion.new(*ARGV).run
@@ -0,0 +1,30 @@
1
#!/usr/bin/env ruby

# Runs flay on ./lib/ and exits with status 1 when any reported entry has a
# score of MAX_SCORE or more; exits with status 0 otherwise.
MAX_SCORE = 45

report_lines = `flay -s ./lib/`.split("\n")
line_pattern = /^ *(.*): (.*)/

errors = []
# The first two lines of the report are the header
report_lines.drop(2).each do |line|
  found = line_pattern.match(line)
  next unless found

  score = found[1].to_f
  errors << { score: score, file: found[2] } unless score < MAX_SCORE
end

exit 0 if errors.empty?

puts 'Flay test failed:'
# Lowest scores are printed first
errors.sort_by { |entry| entry[:score] }.each do |offender|
  puts "#{offender[:score]} / #{MAX_SCORE} in #{offender[:file]}"
end
exit 1
@@ -0,0 +1,31 @@
1
#!/usr/bin/env ruby

# Runs flog on ./lib/ and exits with status 1 when any reported method has a
# score of MAX_SCORE or more; exits with status 0 otherwise.
MAX_SCORE = 45

report_lines = `flog ./lib/`.split("\n")
line_pattern = /^ *(.*): (.*) (.*):[0-9]*/

errors = []
# The first three lines of the report are the header
report_lines.drop(3).each do |line|
  found = line_pattern.match(line)
  next unless found

  score = found[1].to_f
  next if score < MAX_SCORE

  errors << { score: score, method: found[2], file: found[3] }
end

exit 0 if errors.empty?

puts 'Flog test failed:'
# Lowest scores are printed first
errors.sort_by { |entry| entry[:score] }.each do |offender|
  puts "#{offender[:score]} / #{MAX_SCORE}: #{offender[:method]} in #{offender[:file]}"
end
exit 1
data/scripts/coverage ADDED
@@ -0,0 +1,3 @@
1
#!/usr/bin/env bash
# Runs the specs with COVERAGE=1 set.
# Move to the project root first (like the sibling test scripts do) so the
# script behaves the same whatever directory it is called from.
cd "$(dirname "${BASH_SOURCE[0]}")/.." || exit 1

COVERAGE=1 bundle exec rspec
@@ -0,0 +1,16 @@
1
#!/usr/bin/env bash

# Succeed fast if we did not change any ruby file
if ! git status --short | grep -q '\.rb$'; then
  exit 0
fi

# Do not commit any focused or excluded tests.
# [[:space:]] is used for the indentation because `\t` is not a tab in an
# extended regular expression, so tab-indented lines would be missed.
# The trailing group requires the keyword to end there, so identifiers that
# merely start with "fit" or "xit" are not reported.
if grep --color -r -E -e '^[[:space:]]*(fit|fdescribe|xit|xdescribe)([^[:alnum:]_]|$)' 'spec'; then
  echo '✘ You have focused and/or skipped tests'
  exit 1
fi

# Match style guide
./scripts/lint || exit 1
16
+
@@ -0,0 +1,9 @@
1
#!/usr/bin/env bash
# Runs the test script, then the flog and flay checks.
# Exits with status 1 as soon as the tests or the flog check fail; otherwise
# the exit status is the one of the flay check.

if ! ./scripts/test; then
  exit 1
fi

# No over-complex methods
if ! ./scripts/check_flog; then
  exit 1
fi

# No duplication
./scripts/check_flay
data/scripts/lint ADDED
@@ -0,0 +1,2 @@
1
#!/usr/bin/env bash
# Runs rubocop on the lib and spec directories.
# The paths below are relative, so move to the project root first (like the
# sibling test scripts do) instead of relying on the caller's directory.
cd "$(dirname "${BASH_SOURCE[0]}")/.." || exit 1

rubocop -F './lib/' './spec'
data/scripts/release ADDED
@@ -0,0 +1,16 @@
1
#!/usr/bin/env bash
# Rebases master on develop, runs `rake release`, then rebases develop on
# master. Gems are reinstalled after every branch change.

# Stop if any command fails
set -e

# Rebase the current branch on the given one, then reinstall the gems
rebase_on() {
  git rebase "$1"
  bundle install
}

git checkout master
git pull
bundle install

rebase_on develop
rake release

git checkout develop
bundle install
rebase_on master
data/scripts/test ADDED
@@ -0,0 +1,4 @@
1
#!/usr/bin/env bash
# Runs the specs with COVERAGE=1 set, from the project root.
# Abort if the project root cannot be entered, rather than running the specs
# from whatever directory the script was called in.
cd "$(dirname "${BASH_SOURCE[0]}")/.." || exit 1

COVERAGE=1 bundle exec rspec
data/scripts/test_ci ADDED
@@ -0,0 +1,7 @@
1
#!/usr/bin/env bash
# This script will be started by Travis, in the correct context (matrix of Ruby
# version + Gemfile version), so it only needs to load the tests, without
# worrying about appraisal.
# Abort if the project root cannot be entered, rather than running the specs
# from whatever directory the script was called in.
cd "$(dirname "${BASH_SOURCE[0]}")/.." || exit 1

COVERAGE=1 bundle exec rspec
data/scripts/watch ADDED
@@ -0,0 +1,4 @@
1
#!/usr/bin/env bash
# Starts guard from the project root.
# Abort if the project root cannot be entered, rather than starting guard
# from whatever directory the script was called in.
cd "$(dirname "${BASH_SOURCE[0]}")/.." || exit 1

guard
@@ -0,0 +1,441 @@
1
+ require 'spec_helper'
2
+
3
describe(HTMLHierarchyExtractor) do
  # Runs the extractor with its default options on the given HTML string and
  # returns the extracted records
  def records_for(html)
    HTMLHierarchyExtractor.new(html).extract
  end

  # Checks the three hierarchy levels of one extracted record
  def expect_hierarchy(record, lvl0, lvl1, lvl2)
    hierarchy = record[:hierarchy]
    expect(hierarchy[:lvl0]).to eq lvl0
    expect(hierarchy[:lvl1]).to eq lvl1
    expect(hierarchy[:lvl2]).to eq lvl2
  end

  describe 'extract' do
    it 'should load from an HTML string' do
      records = records_for('<p>foo</p>')

      expect(records.size).to eq 1
    end

    it 'should allow overriding of the default css selector of nodes' do
      html = '<div>foo</div>'
      custom = { css_selector: 'div' }

      records = HTMLHierarchyExtractor.new(html, options: custom).extract

      expect(records.size).to eq 1
    end

    it 'should export the Nokogiri node' do
      records = records_for('<p>foo</p>')

      expect(records[0][:node]).to be_an(Nokogiri::XML::Element)
    end

    it 'should remove empty elements' do
      records = records_for('<p></p>')

      expect(records.size).to eq 0
    end

    it 'should add the DOM position to each element' do
      html = '<p>foo</p>
<p>bar</p>
<p>baz</p>'

      records = records_for(html)

      # Positions follow the order of the elements in the page
      3.times do |index|
        expect(records[index][:weight][:position]).to eq index
      end
    end
  end

  describe 'extract_html' do
    it 'should extract outer html' do
      records = records_for('<p>foo</p>')

      expect(records[0][:html]).to eq '<p>foo</p>'
    end

    it 'should trim content' do
      html = '<p>foo</p>
<blink>irrelevant</blink>'

      records = records_for(html)

      expect(records[0][:html]).to eq '<p>foo</p>'
    end
  end

  describe 'extract_text' do
    it 'should extract inner text' do
      records = records_for('<p>foo</p>')

      expect(records[0][:text]).to eq 'foo'
    end

    it 'should extract UTF8 correctly' do
      records = records_for('<p>UTF8‽✗✓</p>')

      expect(records[0][:text]).to eq 'UTF8‽✗✓'
    end
  end

  describe 'extract_tag_name' do
    it 'should extract the tag name' do
      records = records_for('<p>foo</p>')

      expect(records[0][:tag_name]).to eq 'p'
    end

    it 'should always return lowercase' do
      records = records_for('<P>foo</P>')

      expect(records[0][:tag_name]).to eq 'p'
    end
  end

  describe 'extract_hierarchy' do
    it 'should extract a simple hierarchy' do
      html = '<h1>Foo</h1>
<p>First paragraph</p>
<h2>Bar</h2>
<p>Second paragraph</p>
<h3>Baz</h3>
<p>Third paragraph</p>'

      records = records_for(html)

      expect_hierarchy(records[0], 'Foo', nil, nil)
      expect_hierarchy(records[1], 'Foo', 'Bar', nil)
      expect_hierarchy(records[2], 'Foo', 'Bar', 'Baz')
    end

    it 'should use inner text of headings' do
      html = '<h1><a href="#">Foo</a><span></span></h1>
<p>First paragraph</p>'

      records = records_for(html)

      expect_hierarchy(records[0], 'Foo', nil, nil)
    end

    it 'should handle nodes not in any hierarchy' do
      html = '<p>First paragraph</p>
<h1>Foo</h1>'

      records = records_for(html)

      expect_hierarchy(records[0], nil, nil, nil)
    end

    it 'should handle any number of wrappers' do
      html = '<header>
<h1>Foo</h1>
<p>First paragraph</p>
</header>
<div>
<div>
<div>
<h2>Bar</h2>
<p>Second paragraph</p>
</div>
</div>
<div>
<h3>Baz</h3>
<p>Third paragraph</p>
</div>
</div>'

      records = records_for(html)

      expect_hierarchy(records[0], 'Foo', nil, nil)
      expect_hierarchy(records[1], 'Foo', 'Bar', nil)
      expect_hierarchy(records[2], 'Foo', 'Bar', 'Baz')
    end
  end

  describe 'extract_anchor' do
    it 'should get the anchor of parent' do
      html = '<h1 name="anchor">Foo</h1>
<p>First paragraph</p>'

      records = records_for(html)

      expect(records[0][:anchor]).to eq 'anchor'
    end

    it 'should get no anchor if none found' do
      html = '<h1>Foo</h1>
<p>First paragraph</p>'

      records = records_for(html)

      expect(records[0][:anchor]).to be_nil
    end

    it 'should use the id as anchor if no name set' do
      html = '<h1 id="anchor">Foo</h1>
<p>First paragraph</p>'

      records = records_for(html)

      expect(records[0][:anchor]).to eq 'anchor'
    end

    it 'should be set to nil if no name nor id' do
      html = '<h1>Foo</h1>
<p>First paragraph</p>'

      records = records_for(html)

      expect(records[0][:anchor]).to be_nil
    end

    it 'should get the anchor of closest parent with an anchor' do
      html = '<h1 name="anchor">Foo</h1>
<p>First paragraph</p>
<h2>Bar</h2>
<p>Second paragraph</p>
<h3 name="subanchor">Baz</h3>
<p>Third paragraph</p>'

      records = records_for(html)

      anchors = %w(anchor anchor subanchor)
      anchors.each_with_index do |anchor, index|
        expect(records[index][:anchor]).to eq anchor
      end
    end

    it 'should get anchor even if heading not a direct parent' do
      html = '<header>
<h1 name="anchor">Foo</h1>
<p>First paragraph</p>
</header>
<div>
<div>
<div>
<h2>Bar</h2>
<p>Second paragraph</p>
</div>
</div>
<div>
<h3 name="subanchor">Baz</h3>
<p>Third paragraph</p>
</div>
</div>'

      records = records_for(html)

      anchors = %w(anchor anchor subanchor)
      anchors.each_with_index do |anchor, index|
        expect(records[index][:anchor]).to eq anchor
      end
    end

    it 'should get anchor if not directly on the header but inner element' do
      html = '<h1><a name="anchor">Foo</a></h1>
<p>First paragraph</p>'

      records = records_for(html)

      expect(records[0][:anchor]).to eq 'anchor'
    end
  end

  describe 'uuid' do
    it 'should give different uuid if different content' do
      first = records_for('<p>foo</p>')[0]
      second = records_for('<p>bar</p>')[0]

      expect(first[:uuid]).not_to eq(second[:uuid])
    end

    it 'should give different uuid if different HTML tag' do
      first = records_for('<p>foo</p>')[0]
      second = records_for('<p class="bar">foo</p>')[0]

      expect(first[:uuid]).not_to eq(second[:uuid])
    end

    it 'should give different uuid if different position in page' do
      first = records_for('<p>foo</p><p>bar</p>')[1]
      second = records_for('<p>foo</p><p>foo again</p><p>bar</p>')[2]

      expect(first[:uuid]).not_to eq(second[:uuid])
    end

    it 'should give different uuid if different parent header' do
      first = records_for('<h1 name="foo">foo</h1><p>bar</p>')[0]
      second = records_for('<h1 name="bar">bar</h1><p>bar</p>')[0]

      expect(first[:uuid]).not_to eq(second[:uuid])
    end

    it 'should always give the same uuid for the same content' do
      first = records_for('<h1 name="foo">foo</h1><p>bar</p>')[0]
      second = records_for('<h1 name="foo">foo</h1><p>bar</p>')[0]

      expect(first[:uuid]).to eq(second[:uuid])
    end
  end

  describe 'heading_weight' do
    it 'should have 100 if no heading' do
      records = records_for('<p>foo</p>')

      expect(records[0][:weight][:heading]).to eq 100
    end

    it 'should have decreasing value under small headers' do
      html = '<h1 name="one">bar</h1><p>foo</p>
<h2 name="two">bar</h2><p>foo</p>
<h3 name="three">bar</h3><p>foo</p>
<h4 name="four">bar</h4><p>foo</p>
<h5 name="five">bar</h5><p>foo</p>
<h6 name="six">bar</h6><p>foo</p>'

      records = records_for(html)

      # One expected weight per paragraph, from under h1 down to under h6
      [90, 80, 70, 60, 50, 40].each_with_index do |weight, index|
        expect(records[index][:weight][:heading]).to eq weight
      end
    end
  end
end