govspeak 1.3.0 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.md +4 -0
- data/lib/govspeak.rb +5 -0
- data/lib/govspeak/structured_header_extractor.rb +97 -0
- data/lib/govspeak/version.rb +1 -1
- data/test/govspeak_structured_headers_test.rb +103 -0
- metadata +8 -4
data/CHANGELOG.md
ADDED
data/lib/govspeak.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'kramdown'
|
2
2
|
require 'govspeak/header_extractor'
|
3
|
+
require 'govspeak/structured_header_extractor'
|
3
4
|
require 'govspeak/html_validator'
|
4
5
|
require 'govspeak/html_sanitizer'
|
5
6
|
require 'kramdown/parser/kramdown_with_automatic_external_links'
|
@@ -55,6 +56,10 @@ module Govspeak
|
|
55
56
|
Govspeak::HeaderExtractor.convert(kramdown_doc).first
|
56
57
|
end
|
57
58
|
|
59
|
+
# Returns the result of running Govspeak::StructuredHeaderExtractor over
# this document.
def structured_headers
  extractor = Govspeak::StructuredHeaderExtractor.new(self)
  extractor.call
end
|
62
|
+
|
58
63
|
def preprocess(source)
|
59
64
|
@@extensions.each do |title,regexp,block|
|
60
65
|
source.gsub!(regexp) {|match|
|
data/lib/govspeak/structured_header_extractor.rb
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
module Govspeak

  # A heading together with the headings nested directly beneath it.
  #
  # text    - the heading text
  # level   - numeric heading level; a larger number is a deeper heading
  # id      - the heading id
  # headers - Array of child StructuredHeader objects
  StructuredHeader = Struct.new(:text, :level, :id, :headers) do
    # Level at which a heading starts a new top-level entry.
    def top_level
      2
    end

    # True when this heading sits exactly at the top level.
    def top_level?
      level == top_level
    end
  end

  # Turns the flat list of headers exposed by a document into a tree of
  # StructuredHeader objects rooted at the top-level headings.
  class StructuredHeaderExtractor
    # document - any object responding to #headers with a list of items
    #            that respond to #text, #level and #id.
    def initialize(document)
      @doc = document
      @structured_headers = []
      reset_stack
    end

    # Builds the header tree.
    #
    # Headers above the top level are skipped, as are headers that do not
    # fit the nesting (for example one that jumps more than one level
    # deeper than the header before it, or that appears before any
    # top-level header).
    #
    # Each invocation rebuilds the tree from scratch, so calling this
    # method repeatedly yields the same structure every time and never
    # alters a previously returned array.
    #
    # Returns an Array of top-level StructuredHeader objects.
    def call
      # Discard state left over from an earlier run; without this a second
      # run would append duplicates to the previous results.
      @headers_list = nil
      @structured_headers = []
      reset_stack

      headers_list.each do |header|
        next if header_higher_than_top_level?(header)

        if header.top_level?
          add_top_level(header)
        elsif header_at_same_level_as_prev?(header)
          add_sibling(header)
        elsif header_one_level_lower_than_prev?(header)
          add_child(header)
        elsif header_at_higher_level_than_prev?(header)
          add_uncle_or_aunt(header)
        else
          next # ignore semantically invalid headers
        end

        # The header just placed becomes the reference point ("prev") for
        # the next one.
        stack.push(header)
      end

      structured_headers
    end

    attr_reader :doc, :stack, :structured_headers
    private :doc, :stack, :structured_headers

    # The document's headers wrapped as StructuredHeader objects, each
    # starting with an empty list of children. Memoized for the duration
    # of a run; #call clears the memo before it starts.
    def headers_list
      @headers_list ||= doc.headers.map { |h|
        StructuredHeader.new(h.text, h.level, h.id, [])
      }
    end

    # Records a new top-level header and starts a fresh ancestry stack.
    def add_top_level(header)
      structured_headers.push(header)
      reset_stack
    end

    # Attaches header to the parent of the previous header.
    def add_sibling(header)
      stack.pop
      stack.last.headers << header
    end

    # Attaches header beneath the previous header.
    def add_child(header)
      stack.last.headers << header
    end

    # Attaches header to the nearest ancestor that sits one level above it.
    def add_uncle_or_aunt(header)
      pop_stack_to_level(header)
      stack.last.headers << header
    end

    def header_higher_than_top_level?(header)
      header.level < header.top_level
    end

    def header_at_same_level_as_prev?(header)
      stack.last && stack.last.level == header.level
    end

    def header_one_level_lower_than_prev?(header)
      # lower level means level integer is higher
      stack.last && (stack.last.level - header.level == -1)
    end

    def header_at_higher_level_than_prev?(header)
      # higher level means level integer is lower
      stack.last && (stack.last.level > header.level)
    end

    # Unwinds the stack until its last entry is one level above header.
    def pop_stack_to_level(header)
      times_to_pop = stack.last.level - header.level + 1
      times_to_pop.times { stack.pop }
    end

    def reset_stack
      @stack = []
    end
  end
end
|
data/lib/govspeak/version.rb
CHANGED
data/test/govspeak_structured_headers_test.rb
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
# Checks Govspeak::Document#structured_headers against a well-nested
# document and against documents whose headings break the nesting order.
class GovspeakStructuredHeadersTest < Test::Unit::TestCase

  # Fixture: five h2 headings, two of which carry nested sub-headings.
  def document_body
    %{
## Heading 1

## Heading 2

### Sub heading 2.1

### Sub heading 2.2

#### Sub sub heading 2.2.1

### Sub heading 2.3

## Heading 3

## Heading 4

### Sub heading 4.1

#### Sub heading 4.1.1

##### Sub heading 4.1.1.1

### Sub heading 4.2

## Heading 5

}
  end

  # Document built once per test from the well-nested fixture.
  def doc
    return @doc if @doc

    @doc = Govspeak::Document.new(document_body)
  end

  def structured_headers
    doc.structured_headers
  end

  test "Headings with no sub-headings have an empty headings collection" do
    first_heading = structured_headers[0]

    assert_empty first_heading.headers
  end

  test "h2s are extracted as top level headings" do
    top_level_texts = structured_headers.map { |heading| heading.text }

    assert_equal ["Heading 1", "Heading 2", "Heading 3", "Heading 4", "Heading 5"], top_level_texts
  end

  test "headings can have multiple sub-headings" do
    second_heading = structured_headers[1]
    sub_heading_texts = second_heading.headers.map { |heading| heading.text }

    assert_equal ["Sub heading 2.1", "Sub heading 2.2", "Sub heading 2.3"], sub_heading_texts
  end

  test "h3 following h2s are nested within them" do
    second_heading = structured_headers[1]

    assert_equal "Sub heading 2.1", second_heading.headers.first.text
  end

  test "h4 following h3s are nested within them" do
    second_sub_heading = structured_headers[1].headers[1]

    assert_equal "Sub sub heading 2.2.1", second_sub_heading.headers.first.text
  end

  test "h3 can follow an h5" do
    fourth_heading = structured_headers[3]

    assert_equal "Sub heading 4.2", fourth_heading.headers[1].text
  end

  # Fixture: an h3 before any h2, an h4 directly under an h2, and an h1
  # after the valid headings.
  def invalid_document_body
    %{
### Invalid heading (h3)

## Heading 1

#### Invalid heading (h4)

### Sub heading 1.1

# Invalid heading (h1)

}
  end

  # Document built once per test from the badly nested fixture.
  def invalid_doc
    return @invalid_doc if @invalid_doc

    @invalid_doc = Govspeak::Document.new(invalid_document_body)
  end

  def invalid_structured_headers
    invalid_doc.structured_headers
  end

  test "semantically invalid headers are ignored" do
    top_level_texts = invalid_structured_headers.map { |heading| heading.text }
    assert_equal ["Heading 1"], top_level_texts

    nested_texts = invalid_structured_headers[0].headers.map { |heading| heading.text }
    assert_equal ["Sub heading 1.1"], nested_texts
  end

  test "document with single h1 produces no headers" do
    h1_only_doc = Govspeak::Document.new("# Heading\n")

    assert_equal [], h1_only_doc.structured_headers
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: govspeak
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.0
|
4
|
+
version: 1.4.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2014-
|
13
|
+
date: 2014-03-05 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: kramdown
|
@@ -140,13 +140,16 @@ files:
|
|
140
140
|
- lib/govspeak/version.rb
|
141
141
|
- lib/govspeak/html_sanitizer.rb
|
142
142
|
- lib/govspeak/header_extractor.rb
|
143
|
+
- lib/govspeak/structured_header_extractor.rb
|
143
144
|
- README.md
|
145
|
+
- CHANGELOG.md
|
144
146
|
- Gemfile
|
145
147
|
- Rakefile
|
146
148
|
- test/govspeak_test_helper.rb
|
147
149
|
- test/html_validator_test.rb
|
148
150
|
- test/govspeak_test.rb
|
149
151
|
- test/html_sanitizer_test.rb
|
152
|
+
- test/govspeak_structured_headers_test.rb
|
150
153
|
- test/test_helper.rb
|
151
154
|
homepage: http://github.com/alphagov/govspeak
|
152
155
|
licenses: []
|
@@ -162,7 +165,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
162
165
|
version: '0'
|
163
166
|
segments:
|
164
167
|
- 0
|
165
|
-
hash:
|
168
|
+
hash: -3957971095230574253
|
166
169
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
167
170
|
none: false
|
168
171
|
requirements:
|
@@ -171,7 +174,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
171
174
|
version: '0'
|
172
175
|
segments:
|
173
176
|
- 0
|
174
|
-
hash:
|
177
|
+
hash: -3957971095230574253
|
175
178
|
requirements: []
|
176
179
|
rubyforge_project:
|
177
180
|
rubygems_version: 1.8.23
|
@@ -183,4 +186,5 @@ test_files:
|
|
183
186
|
- test/html_validator_test.rb
|
184
187
|
- test/govspeak_test.rb
|
185
188
|
- test/html_sanitizer_test.rb
|
189
|
+
- test/govspeak_structured_headers_test.rb
|
186
190
|
- test/test_helper.rb
|