html-hierarchy-extractor 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/version.rb ADDED
@@ -0,0 +1,6 @@
1
# Expose gem version
class HTMLHierarchyExtractorVersion
  class << self
    # Current gem version, formatted as 'major.minor.patch'.
    #
    # @return [String]
    def to_s
      '1.0.0'
    end
  end
end
@@ -0,0 +1,47 @@
1
+ #!/usr/bin/env ruby
2
+ require_relative '../lib/version.rb'
3
+
4
# Simple script used to bump the version number.
#
# Reads the current version from HTMLHierarchyExtractorVersion, computes the
# next one, rewrites ../lib/version.rb (relative to this script) and commits
# the change with git.
class BumpVersion
  # Bump types accepted as first command-line argument
  VALID_TYPES = %w(major minor patch).freeze

  # @param args [Array<String>] command-line arguments; only the first one
  #   (the bump type) is used
  #
  # Prints an error and exits with status 1 when the type is not one of
  # VALID_TYPES.
  def initialize(*args)
    @type = args[0]
    return if valid_type?(@type)

    puts "Invalid bump type: #{@type}"
    exit 1
  end

  # @param type [String, nil] candidate bump type
  # @return [Boolean] true when type is 'major', 'minor' or 'patch'
  def valid_type?(type)
    VALID_TYPES.include?(type)
  end

  # Computes the version that follows current_version.
  #
  # @param current_version [String] dot-separated version, e.g. '1.2.3'
  # @param type [String] 'major', 'minor' or 'patch'; any other value leaves
  #   the numbers untouched
  # @return [String] the new 'major.minor.patch' string
  def bump(current_version, type)
    major, minor, patch = current_version.split('.').map(&:to_i)
    # Components missing from a short version ('1.2', '1') count as zero
    # instead of blowing up on nil arithmetic
    major ||= 0
    minor ||= 0
    patch ||= 0

    case type
    when 'major' then "#{major + 1}.0.0"
    when 'minor' then "#{major}.#{minor + 1}.0"
    when 'patch' then "#{major}.#{minor}.#{patch + 1}"
    else "#{major}.#{minor}.#{patch}"
    end
  end

  # Rewrites the version file with the bumped version, then stages and
  # commits it.
  def run
    old_version = HTMLHierarchyExtractorVersion.to_s
    new_version = bump(old_version, @type)

    file = File.expand_path('../lib/version.rb', File.dirname(__FILE__))
    File.write(file, File.read(file).gsub(old_version, new_version))

    # Argument-list form of system: no shell is involved, so a path holding
    # spaces or shell metacharacters is passed to git untouched. The commit
    # is only attempted when staging succeeded. Unlike backticks, git's
    # output is shown rather than captured.
    system('git', 'add', file) &&
      system('git', 'commit', '-m',
             "chore(bump): Version bump to #{new_version}")
  end
end
47
# Entry point: build the bumper from the command-line arguments and run it
bumper = BumpVersion.new(*ARGV)
bumper.run
@@ -0,0 +1,30 @@
1
#!/usr/bin/env ruby

# Exits with status 1, listing the offenders, when flay reports a score of
# MAX_SCORE or more for any entry under ./lib/; exits 0 otherwise.
MAX_SCORE = 45

report = `flay -s ./lib/`.split("\n")
# Entries are read as "<score>: <file>", optionally indented
entry_pattern = /^ *(.*): (.*)/

# The first two lines of the report are a header
offenders = report.drop(2).each_with_object([]) do |row, found|
  parts = entry_pattern.match(row)
  next if parts.nil?

  value = parts[1].to_f
  next if value < MAX_SCORE

  found << { score: value, file: parts[2] }
end

exit 0 if offenders.empty?

puts 'Flay test failed:'
offenders.sort_by { |entry| entry[:score] }.each do |entry|
  puts "#{entry[:score]} / #{MAX_SCORE} in #{entry[:file]}"
end
exit 1
@@ -0,0 +1,31 @@
1
#!/usr/bin/env ruby

# Exits with status 1, listing the offenders, when flog reports a score of
# MAX_SCORE or more for any entry under ./lib/; exits 0 otherwise.
MAX_SCORE = 45

report = `flog ./lib/`.split("\n")
# Entries are read as "<score>: <method> <file>:<line>", optionally indented
entry_pattern = /^ *(.*): (.*) (.*):[0-9]*/

# The first three lines of the report are a header
offenders = report.drop(3).each_with_object([]) do |row, found|
  parts = entry_pattern.match(row)
  next if parts.nil?

  value = parts[1].to_f
  next if value < MAX_SCORE

  found << { score: value, method: parts[2], file: parts[3] }
end

exit 0 if offenders.empty?

puts 'Flog test failed:'
offenders.sort_by { |entry| entry[:score] }.each do |entry|
  puts "#{entry[:score]} / #{MAX_SCORE}: #{entry[:method]} in #{entry[:file]}"
end
exit 1
data/scripts/coverage ADDED
@@ -0,0 +1,3 @@
1
#!/usr/bin/env bash
# Run the specs with coverage enabled.
# Move to the project root (parent of this script's directory) first, so the
# script works from any working directory; abort if that fails rather than
# running rspec from the wrong place.
cd "$(dirname "${BASH_SOURCE[0]}")"/.. || exit 1

COVERAGE=1 bundle exec rspec
@@ -0,0 +1,16 @@
1
#!/usr/bin/env bash

# Succeed fast if we did not change any ruby file
if ! git status --short | grep -q '\.rb$'; then
  exit 0
fi

# Do not commit any focused or excluded tests.
# [[:space:]] covers tab indentation (a "\t" escape is not a tab in a grep
# ERE), and the trailing group requires the keyword to stop there, so
# identifiers that merely start with "fit" or "xit" are not reported.
if grep --color -r -E \
  -e '^[[:space:]]*(fit|fdescribe|fcontext|xit|xdescribe|xcontext)([[:space:](]|$)' \
  spec; then
  echo '✘ You have focused and/or skipped tests'
  exit 1
fi

# Match style guide
./scripts/lint || exit 1
16
+
@@ -0,0 +1,9 @@
1
#!/usr/bin/env bash

# Test suite first; give up with status 1 if it fails
if ! ./scripts/test; then
  exit 1
fi

# No over-complex methods
if ! ./scripts/check_flog; then
  exit 1
fi

# No duplication; the status of this last check is the status of the script
./scripts/check_flay
data/scripts/lint ADDED
@@ -0,0 +1,2 @@
1
#!/usr/bin/env bash
# Lint the library and the specs, stopping at the first offending file
rubocop --fail-fast ./lib/ ./spec
data/scripts/release ADDED
@@ -0,0 +1,16 @@
1
#!/usr/bin/env bash
# Stop if any command fails
set -e

# Rebase the current branch on top of the given one, then refresh the gems
rebase_onto() {
  git rebase "$1"
  bundle install
}

# Bring master up to date, replay develop on it and publish
git checkout master
git pull
bundle install
rebase_onto develop
rake release

# Bring develop back in line with the released master
git checkout develop
bundle install
rebase_onto master
data/scripts/test ADDED
@@ -0,0 +1,4 @@
1
#!/usr/bin/env bash
# Run the specs from the project root (parent of this script's directory).
# Abort if the directory change fails, so rspec never runs from the wrong
# place.
cd "$(dirname "${BASH_SOURCE[0]}")"/.. || exit 1

COVERAGE=1 bundle exec rspec
data/scripts/test_ci ADDED
@@ -0,0 +1,7 @@
1
#!/usr/bin/env bash
# This script will be started by Travis, in the correct context (matrix of Ruby
# version + Gemfile version), so it only needs to load the tests, without
# worrying about appraisal.
# Abort if the move to the project root fails, so rspec never runs from the
# wrong place.
cd "$(dirname "${BASH_SOURCE[0]}")"/.. || exit 1

COVERAGE=1 bundle exec rspec
data/scripts/watch ADDED
@@ -0,0 +1,4 @@
1
#!/usr/bin/env bash
# Start guard from the project root (parent of this script's directory).
# Abort if the directory change fails, so guard never starts from the wrong
# place.
cd "$(dirname "${BASH_SOURCE[0]}")"/.. || exit 1

guard
@@ -0,0 +1,441 @@
1
+ require 'spec_helper'
2
+
3
+ describe(HTMLHierarchyExtractor) do
4
describe 'extract' do
  # Records extracted from +html+ when no options are passed
  def records_from(html)
    HTMLHierarchyExtractor.new(html).extract
  end

  it 'should load from an HTML string' do
    records = records_from('<p>foo</p>')

    expect(records.size).to eq 1
  end

  it 'should allow overriding of the default css selector of nodes' do
    custom = { css_selector: 'div' }
    extractor = HTMLHierarchyExtractor.new('<div>foo</div>', options: custom)

    records = extractor.extract

    expect(records.size).to eq 1
  end

  it 'should export the Nokogiri node' do
    records = records_from('<p>foo</p>')

    expect(records[0][:node]).to be_an(Nokogiri::XML::Element)
  end

  it 'should remove empty elements' do
    records = records_from('<p></p>')

    expect(records.size).to eq 0
  end

  it 'should add the DOM position to each element' do
    html = ['<p>foo</p>', '<p>bar</p>', '<p>baz</p>'].join("\n")

    records = records_from(html)

    # Each paragraph is expected to carry its own rank in the document
    [0, 1, 2].each do |rank|
      expect(records[rank][:weight][:position]).to eq rank
    end
  end
end
67
+
68
describe 'extract_html' do
  # :html value of the first record extracted from +source+
  def first_html(source)
    HTMLHierarchyExtractor.new(source).extract[0][:html]
  end

  it 'should extract outer html' do
    expect(first_html('<p>foo</p>')).to eq '<p>foo</p>'
  end

  it 'should trim content' do
    source = ['<p>foo</p>', '<blink>irrelevant</blink>'].join("\n")

    expect(first_html(source)).to eq '<p>foo</p>'
  end
end
92
+
93
describe 'extract_text' do
  # :text value of the first record extracted from +source+
  def first_text(source)
    HTMLHierarchyExtractor.new(source).extract[0][:text]
  end

  it 'should extract inner text' do
    expect(first_text('<p>foo</p>')).to eq 'foo'
  end

  it 'should extract UTF8 correctly' do
    expect(first_text('<p>UTF8‽✗✓</p>')).to eq 'UTF8‽✗✓'
  end
end
116
+
117
describe 'extract_tag_name' do
  # :tag_name value of the first record extracted from +source+
  def first_tag_name(source)
    HTMLHierarchyExtractor.new(source).extract[0][:tag_name]
  end

  it 'should extract the tag name' do
    expect(first_tag_name('<p>foo</p>')).to eq 'p'
  end

  it 'should always return lowercase' do
    expect(first_tag_name('<P>foo</P>')).to eq 'p'
  end
end
140
+
141
describe 'extract_hierarchy' do
  # Records extracted from +html+ when no options are passed
  def records_from(html)
    HTMLHierarchyExtractor.new(html).extract
  end

  # The three hierarchy levels of +record+, as [lvl0, lvl1, lvl2]
  def levels_of(record)
    hierarchy = record[:hierarchy]
    [hierarchy[:lvl0], hierarchy[:lvl1], hierarchy[:lvl2]]
  end

  it 'should extract a simple hierarchy' do
    html = [
      '<h1>Foo</h1>',
      '<p>First paragraph</p>',
      '<h2>Bar</h2>',
      '<p>Second paragraph</p>',
      '<h3>Baz</h3>',
      '<p>Third paragraph</p>'
    ].join("\n")

    records = records_from(html)

    expect(levels_of(records[0])).to eq ['Foo', nil, nil]
    expect(levels_of(records[1])).to eq ['Foo', 'Bar', nil]
    expect(levels_of(records[2])).to eq %w(Foo Bar Baz)
  end

  it 'should use inner text of headings' do
    html = [
      '<h1><a href="#">Foo</a><span></span></h1>',
      '<p>First paragraph</p>'
    ].join("\n")

    records = records_from(html)

    expect(levels_of(records[0])).to eq ['Foo', nil, nil]
  end

  it 'should handle nodes not in any hierarchy' do
    html = ['<p>First paragraph</p>', '<h1>Foo</h1>'].join("\n")

    records = records_from(html)

    expect(levels_of(records[0])).to eq [nil, nil, nil]
  end

  it 'should handle any number of wrappers' do
    html = [
      '<header>',
      '<h1>Foo</h1>',
      '<p>First paragraph</p>',
      '</header>',
      '<div>',
      '<div>',
      '<div>',
      '<h2>Bar</h2>',
      '<p>Second paragraph</p>',
      '</div>',
      '</div>',
      '<div>',
      '<h3>Baz</h3>',
      '<p>Third paragraph</p>',
      '</div>',
      '</div>'
    ].join("\n")

    records = records_from(html)

    expect(levels_of(records[0])).to eq ['Foo', nil, nil]
    expect(levels_of(records[1])).to eq ['Foo', 'Bar', nil]
    expect(levels_of(records[2])).to eq %w(Foo Bar Baz)
  end
end
232
+
233
describe 'extract_anchor' do
  # :anchor value of every record extracted from +html+, in order
  def anchors_from(html)
    HTMLHierarchyExtractor.new(html).extract.map { |record| record[:anchor] }
  end

  it 'should get the anchor of parent' do
    html = [
      '<h1 name="anchor">Foo</h1>',
      '<p>First paragraph</p>'
    ].join("\n")

    expect(anchors_from(html)[0]).to eq 'anchor'
  end

  it 'should get no anchor if none found' do
    html = ['<h1>Foo</h1>', '<p>First paragraph</p>'].join("\n")

    expect(anchors_from(html)[0]).to eq nil
  end

  it 'should use the id as anchor if no name set' do
    html = [
      '<h1 id="anchor">Foo</h1>',
      '<p>First paragraph</p>'
    ].join("\n")

    expect(anchors_from(html)[0]).to eq 'anchor'
  end

  it 'should be set to nil if no name nor id' do
    html = ['<h1>Foo</h1>', '<p>First paragraph</p>'].join("\n")

    expect(anchors_from(html)[0]).to eq nil
  end

  it 'should get the anchor of closest parent with an anchor' do
    html = [
      '<h1 name="anchor">Foo</h1>',
      '<p>First paragraph</p>',
      '<h2>Bar</h2>',
      '<p>Second paragraph</p>',
      '<h3 name="subanchor">Baz</h3>',
      '<p>Third paragraph</p>'
    ].join("\n")

    anchors = anchors_from(html)

    expect(anchors[0]).to eq 'anchor'
    expect(anchors[1]).to eq 'anchor'
    expect(anchors[2]).to eq 'subanchor'
  end

  it 'should get anchor even if heading not a direct parent' do
    html = [
      '<header>',
      '<h1 name="anchor">Foo</h1>',
      '<p>First paragraph</p>',
      '</header>',
      '<div>',
      '<div>',
      '<div>',
      '<h2>Bar</h2>',
      '<p>Second paragraph</p>',
      '</div>',
      '</div>',
      '<div>',
      '<h3 name="subanchor">Baz</h3>',
      '<p>Third paragraph</p>',
      '</div>',
      '</div>'
    ].join("\n")

    anchors = anchors_from(html)

    expect(anchors[0]).to eq 'anchor'
    expect(anchors[1]).to eq 'anchor'
    expect(anchors[2]).to eq 'subanchor'
  end

  it 'should get anchor if not directly on the header but inner element' do
    html = [
      '<h1><a name="anchor">Foo</a></h1>',
      '<p>First paragraph</p>'
    ].join("\n")

    expect(anchors_from(html)[0]).to eq 'anchor'
  end
end
340
+
341
describe 'uuid' do
  # :uuid value of the record at +index+ among those extracted from +html+
  def uuid_at(html, index = 0)
    HTMLHierarchyExtractor.new(html).extract[index][:uuid]
  end

  it 'should give different uuid if different content' do
    first = uuid_at('<p>foo</p>')
    second = uuid_at('<p>bar</p>')

    expect(first).not_to eq(second)
  end

  it 'should give different uuid if different HTML tag' do
    first = uuid_at('<p>foo</p>')
    second = uuid_at('<p class="bar">foo</p>')

    expect(first).not_to eq(second)
  end

  it 'should give different uuid if different position in page' do
    # Same trailing paragraph, sitting at a different rank in each page
    first = uuid_at('<p>foo</p><p>bar</p>', 1)
    second = uuid_at('<p>foo</p><p>foo again</p><p>bar</p>', 2)

    expect(first).not_to eq(second)
  end

  it 'should give different uuid if different parent header' do
    first = uuid_at('<h1 name="foo">foo</h1><p>bar</p>')
    second = uuid_at('<h1 name="bar">bar</h1><p>bar</p>')

    expect(first).not_to eq(second)
  end

  it 'should always give the same uuid for the same content' do
    first = uuid_at('<h1 name="foo">foo</h1><p>bar</p>')
    second = uuid_at('<h1 name="foo">foo</h1><p>bar</p>')

    expect(first).to eq(second)
  end
end
407
+
408
describe 'heading_weight' do
  # Heading weight of every record extracted from +html+, in order
  def heading_weights(html)
    HTMLHierarchyExtractor.new(html).extract.map do |record|
      record[:weight][:heading]
    end
  end

  it 'should have 100 if no heading' do
    expect(heading_weights('<p>foo</p>')[0]).to eq 100
  end

  it 'should have decreasing value under small headers' do
    html = [
      '<h1 name="one">bar</h1><p>foo</p>',
      '<h2 name="two">bar</h2><p>foo</p>',
      '<h3 name="three">bar</h3><p>foo</p>',
      '<h4 name="four">bar</h4><p>foo</p>',
      '<h5 name="five">bar</h5><p>foo</p>',
      '<h6 name="six">bar</h6><p>foo</p>'
    ].join("\n")

    weights = heading_weights(html)

    # One paragraph per heading level, from h1 down to h6
    [90, 80, 70, 60, 50, 40].each_with_index do |expected, rank|
      expect(weights[rank]).to eq expected
    end
  end
end
441
+ end