govspeak 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.md +4 -0
- data/lib/govspeak.rb +5 -0
- data/lib/govspeak/structured_header_extractor.rb +97 -0
- data/lib/govspeak/version.rb +1 -1
- data/test/govspeak_structured_headers_test.rb +103 -0
- metadata +8 -4
data/CHANGELOG.md
ADDED
data/lib/govspeak.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'kramdown'
|
2
2
|
require 'govspeak/header_extractor'
|
3
|
+
require 'govspeak/structured_header_extractor'
|
3
4
|
require 'govspeak/html_validator'
|
4
5
|
require 'govspeak/html_sanitizer'
|
5
6
|
require 'kramdown/parser/kramdown_with_automatic_external_links'
|
@@ -55,6 +56,10 @@ module Govspeak
|
|
55
56
|
Govspeak::HeaderExtractor.convert(kramdown_doc).first
|
56
57
|
end
|
57
58
|
|
59
|
+
def structured_headers
|
60
|
+
Govspeak::StructuredHeaderExtractor.new(self).call
|
61
|
+
end
|
62
|
+
|
58
63
|
def preprocess(source)
|
59
64
|
@@extensions.each do |title,regexp,block|
|
60
65
|
source.gsub!(regexp) {|match|
|
@@ -0,0 +1,97 @@
|
|
1
|
+
module Govspeak
|
2
|
+
|
3
|
+
StructuredHeader = Struct.new(:text, :level, :id, :headers) do
|
4
|
+
def top_level
|
5
|
+
2
|
6
|
+
end
|
7
|
+
|
8
|
+
def top_level?
|
9
|
+
level == top_level
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
class StructuredHeaderExtractor
|
14
|
+
def initialize(document)
|
15
|
+
@doc = document
|
16
|
+
@structured_headers = []
|
17
|
+
reset_stack
|
18
|
+
end
|
19
|
+
|
20
|
+
def call
|
21
|
+
headers_list.each do |header|
|
22
|
+
next if header_higher_than_top_level?(header)
|
23
|
+
|
24
|
+
if header.top_level?
|
25
|
+
add_top_level(header)
|
26
|
+
elsif header_at_same_level_as_prev?(header)
|
27
|
+
add_sibling(header)
|
28
|
+
elsif header_one_level_lower_than_prev?(header)
|
29
|
+
add_child(header)
|
30
|
+
elsif header_at_higher_level_than_prev?(header)
|
31
|
+
add_uncle_or_aunt(header)
|
32
|
+
else
|
33
|
+
next # ignore semantically invalid headers
|
34
|
+
end
|
35
|
+
|
36
|
+
stack.push(header)
|
37
|
+
end
|
38
|
+
|
39
|
+
structured_headers
|
40
|
+
end
|
41
|
+
|
42
|
+
attr_reader :doc, :stack, :structured_headers
|
43
|
+
private :doc, :stack, :structured_headers
|
44
|
+
|
45
|
+
def headers_list
|
46
|
+
@headers_list ||= doc.headers.map { |h|
|
47
|
+
StructuredHeader.new(h.text, h.level, h.id, [])
|
48
|
+
}
|
49
|
+
end
|
50
|
+
|
51
|
+
def add_top_level(header)
|
52
|
+
structured_headers.push(header)
|
53
|
+
reset_stack
|
54
|
+
end
|
55
|
+
|
56
|
+
def add_sibling(header)
|
57
|
+
stack.pop
|
58
|
+
stack.last.headers << header
|
59
|
+
end
|
60
|
+
|
61
|
+
def add_child(header)
|
62
|
+
stack.last.headers << header
|
63
|
+
end
|
64
|
+
|
65
|
+
def add_uncle_or_aunt(header)
|
66
|
+
pop_stack_to_level(header)
|
67
|
+
stack.last.headers << header
|
68
|
+
end
|
69
|
+
|
70
|
+
def header_higher_than_top_level?(header)
|
71
|
+
header.level < header.top_level
|
72
|
+
end
|
73
|
+
|
74
|
+
def header_at_same_level_as_prev?(header)
|
75
|
+
stack.last && stack.last.level == header.level
|
76
|
+
end
|
77
|
+
|
78
|
+
def header_one_level_lower_than_prev?(header)
|
79
|
+
# lower level means level integer is higher
|
80
|
+
stack.last && (stack.last.level - header.level == -1)
|
81
|
+
end
|
82
|
+
|
83
|
+
def header_at_higher_level_than_prev?(header)
|
84
|
+
# higher level means level integer is lower
|
85
|
+
stack.last && (stack.last.level > header.level)
|
86
|
+
end
|
87
|
+
|
88
|
+
def pop_stack_to_level(header)
|
89
|
+
times_to_pop = stack.last.level - header.level + 1
|
90
|
+
times_to_pop.times { stack.pop }
|
91
|
+
end
|
92
|
+
|
93
|
+
def reset_stack
|
94
|
+
@stack = []
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
data/lib/govspeak/version.rb
CHANGED
@@ -0,0 +1,103 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
class GovspeakStructuredHeadersTest < Test::Unit::TestCase
|
4
|
+
|
5
|
+
def document_body
|
6
|
+
%{
|
7
|
+
## Heading 1
|
8
|
+
|
9
|
+
## Heading 2
|
10
|
+
|
11
|
+
### Sub heading 2.1
|
12
|
+
|
13
|
+
### Sub heading 2.2
|
14
|
+
|
15
|
+
#### Sub sub heading 2.2.1
|
16
|
+
|
17
|
+
### Sub heading 2.3
|
18
|
+
|
19
|
+
## Heading 3
|
20
|
+
|
21
|
+
## Heading 4
|
22
|
+
|
23
|
+
### Sub heading 4.1
|
24
|
+
|
25
|
+
#### Sub heading 4.1.1
|
26
|
+
|
27
|
+
##### Sub heading 4.1.1.1
|
28
|
+
|
29
|
+
### Sub heading 4.2
|
30
|
+
|
31
|
+
## Heading 5
|
32
|
+
|
33
|
+
}
|
34
|
+
end
|
35
|
+
|
36
|
+
def doc
|
37
|
+
@doc ||= Govspeak::Document.new(document_body)
|
38
|
+
end
|
39
|
+
|
40
|
+
def structured_headers
|
41
|
+
doc.structured_headers
|
42
|
+
end
|
43
|
+
|
44
|
+
test "Headings with no sub-headings have an empty headings collection" do
|
45
|
+
assert_empty structured_headers.first.headers
|
46
|
+
end
|
47
|
+
|
48
|
+
test "h2s are extracted as top level headings" do
|
49
|
+
expected_headings = ["Heading 1", "Heading 2", "Heading 3", "Heading 4", "Heading 5"]
|
50
|
+
|
51
|
+
assert_equal expected_headings, structured_headers.map(&:text)
|
52
|
+
end
|
53
|
+
|
54
|
+
test "headings can have multiple sub-headings" do
|
55
|
+
expected_heading_texts = ["Sub heading 2.1", "Sub heading 2.2", "Sub heading 2.3"]
|
56
|
+
assert_equal expected_heading_texts, structured_headers[1].headers.map(&:text)
|
57
|
+
end
|
58
|
+
|
59
|
+
test "h3 following h2s are nested within them" do
|
60
|
+
assert_equal "Sub heading 2.1", structured_headers[1].headers[0].text
|
61
|
+
end
|
62
|
+
|
63
|
+
test "h4 following h3s are nested within them" do
|
64
|
+
assert_equal "Sub sub heading 2.2.1", structured_headers[1].headers[1].headers[0].text
|
65
|
+
end
|
66
|
+
|
67
|
+
test "h3 can follow an h5" do
|
68
|
+
assert_equal "Sub heading 4.2", structured_headers[3].headers[1].text
|
69
|
+
end
|
70
|
+
|
71
|
+
def invalid_document_body
|
72
|
+
%{
|
73
|
+
### Invalid heading (h3)
|
74
|
+
|
75
|
+
## Heading 1
|
76
|
+
|
77
|
+
#### Invalid heading (h4)
|
78
|
+
|
79
|
+
### Sub heading 1.1
|
80
|
+
|
81
|
+
# Invalid heading (h1)
|
82
|
+
|
83
|
+
}
|
84
|
+
end
|
85
|
+
|
86
|
+
def invalid_doc
|
87
|
+
@invalid_doc ||= Govspeak::Document.new(invalid_document_body)
|
88
|
+
end
|
89
|
+
|
90
|
+
def invalid_structured_headers
|
91
|
+
invalid_doc.structured_headers
|
92
|
+
end
|
93
|
+
|
94
|
+
test "semantically invalid headers are ignored" do
|
95
|
+
assert_equal ["Heading 1"], invalid_structured_headers.map(&:text)
|
96
|
+
|
97
|
+
assert_equal ["Sub heading 1.1"], invalid_structured_headers.first.headers.map(&:text)
|
98
|
+
end
|
99
|
+
|
100
|
+
test "document with single h1 produces no headers" do
|
101
|
+
assert_equal [], Govspeak::Document.new("# Heading\n").structured_headers
|
102
|
+
end
|
103
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: govspeak
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.0
|
4
|
+
version: 1.4.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2014-
|
13
|
+
date: 2014-03-05 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: kramdown
|
@@ -140,13 +140,16 @@ files:
|
|
140
140
|
- lib/govspeak/version.rb
|
141
141
|
- lib/govspeak/html_sanitizer.rb
|
142
142
|
- lib/govspeak/header_extractor.rb
|
143
|
+
- lib/govspeak/structured_header_extractor.rb
|
143
144
|
- README.md
|
145
|
+
- CHANGELOG.md
|
144
146
|
- Gemfile
|
145
147
|
- Rakefile
|
146
148
|
- test/govspeak_test_helper.rb
|
147
149
|
- test/html_validator_test.rb
|
148
150
|
- test/govspeak_test.rb
|
149
151
|
- test/html_sanitizer_test.rb
|
152
|
+
- test/govspeak_structured_headers_test.rb
|
150
153
|
- test/test_helper.rb
|
151
154
|
homepage: http://github.com/alphagov/govspeak
|
152
155
|
licenses: []
|
@@ -162,7 +165,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
162
165
|
version: '0'
|
163
166
|
segments:
|
164
167
|
- 0
|
165
|
-
hash:
|
168
|
+
hash: -3957971095230574253
|
166
169
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
167
170
|
none: false
|
168
171
|
requirements:
|
@@ -171,7 +174,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
171
174
|
version: '0'
|
172
175
|
segments:
|
173
176
|
- 0
|
174
|
-
hash:
|
177
|
+
hash: -3957971095230574253
|
175
178
|
requirements: []
|
176
179
|
rubyforge_project:
|
177
180
|
rubygems_version: 1.8.23
|
@@ -183,4 +186,5 @@ test_files:
|
|
183
186
|
- test/html_validator_test.rb
|
184
187
|
- test/govspeak_test.rb
|
185
188
|
- test/html_sanitizer_test.rb
|
189
|
+
- test/govspeak_structured_headers_test.rb
|
186
190
|
- test/test_helper.rb
|