slaw 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,181 @@
1
+ module Slaw
2
+ module Parse
3
+ module Blocklists
4
+ include Slaw::Namespace
5
+
6
# Correctly re-nest nested block lists.
#
# A flat run of sibling items such as
#
#   (a)
#   (b)
#   (i)
#   (ii)
#   (aa)
#   (bb)
#   (c)
#   (d)
#
# becomes
#
#   (a)
#   (b)
#     (i)
#     (ii)
#       (aa)
#       (bb)
#   (c)
#   (d)
#
# Every +blockList+ element in +doc+ is visited and +doc+ is modified in
# place. The numbering format of a list's first item is taken as the format
# of the list itself.
#
# A +blockList+ without any +item+ children is skipped: there is nothing to
# nest, and guessing the number format of a missing first item would fail.
#
# Returns whatever iterating the matched +blockList+ elements returns.
def self.nest_blocklists(doc)
  doc.xpath('//a:blockList', a: NS).each do |blocklist|
    items = blocklist.xpath('a:item', a: NS)
    next if items.empty?

    nest_blocklist_items(items.to_a, guess_number_format(items.first), nil, nil)
  end
end
34
+
35
# Walk the queue of sibling +items+ and nest them according to their
# numbering format.
#
# items             - Array of remaining sibling elements; consumed from the
#                     front and mutated in place. An item that belongs to an
#                     enclosing list is pushed back onto the front before
#                     returning, so the caller can pick it up.
# our_number_format - the numbering format of the list being filled.
# list              - the blockList element that items of our format are
#                     moved into, or nil when they should stay where they are.
# prev              - the item handled just before the first queued item,
#                     or nil when there is none.
#
# Processing stops at the first element that is not an +item+, or whose
# number format cannot be determined. Always returns nil.
def self.nest_blocklist_items(items, our_number_format, list, prev)
  return if items.empty?

  sublist_count = 0
  number_format = nil
  item = items.shift

  while item and item.name == 'item'
    number_format = guess_number_format(item, number_format)
    break unless number_format

    if number_format == our_number_format
      # same number format

      # if this num is (i), we're numbering in :i, and this isn't the first
      # element in this list, then assume we're following (h) with (i) and
      # hand the item back to the enclosing list
      if number_format.type == :i && item.num == "(i)" && prev
        items.unshift(item)
        break
      end

      # keep it with this list
      if list
        list << item
        item['id'] = list['id'] + ".#{item.num.gsub(/[()]/, '')}"
      end
    elsif number_format < our_number_format
      # back to the old list: let the caller deal with this item
      items.unshift(item)
      break
    else
      # new sublist.
      #
      # The blockList is inserted as a child of the sibling just before
      # +item+, and that sibling's content is moved into the
      # +listIntroduction+ of the new list.
      sublist = item.document.create_element('blockList', id: prev['id'] + ".list#{sublist_count}")
      sublist_count += 1

      # list intro: the element following the previous item's <num>
      num = prev.at_xpath('a:num', a: NS)
      intro = num.next_element
      if intro
        intro.name = 'listIntroduction'
        sublist << intro
      end

      # make +item+ the first in this list
      item['id'] = sublist['id'] + ".#{item.num.gsub(/[()]/, '')}"
      sublist << item

      # insert this list as a child of the previous item
      prev << sublist

      # now keep walking item's (old) siblings and pull in those
      # elements that match the sublist's numbering scheme
      nest_blocklist_items(items, number_format, sublist, item)
    end

    prev = item
    item = items.shift
  end
end
102
+
103
# Guess the numbering format of +item+ from the text of its number.
#
# item        - an element responding to +num+, +previous_element+ and
#               +next_element+.
# prev_format - the format of the preceding item; returned unchanged for the
#               ambiguous numbers "(u)", "(v)" and "(x)".
#
# Returns a NumberingFormat, +prev_format+ for the ambiguous numbers above
# (which may be nil), or nil when the item has no number.
def self.guess_number_format(item, prev_format=nil)
  num = item.num
  return nil unless num

  case num
  when "(i)"
    # Special case to detect difference between:
    #
    #   (h) foo
    #   (i) bar
    #   (j) baz
    #
    # and
    #
    #   (h) foo
    #     (i) bar
    #     (ii) baz
    #
    # (i) is NOT a sublist if:
    #   - there was a previous item (h), and
    #   - there is not a next item, or
    #   - the next item is something other than (ii)
    before = item.previous_element
    after = item.next_element

    follows_h = before && before.num =~ /^\(h/
    opens_roman_list = after && after.num == "(ii)"

    if follows_h && !opens_roman_list
      NumberingFormat.a
    else
      NumberingFormat.i
    end
  when "(u)", "(v)", "(x)"
    # could be either a letter or a numeral: stick with what we had
    prev_format
  when /^\([ivx]+/ then NumberingFormat.i
  when /^\([a-z][a-z]/ then NumberingFormat.aa
  when /^\([a-z]+/i then NumberingFormat.a
  when /^\d+(\.\d+)+$/
    # dotted numbers such as 1.2.3: the ordinal is the number of dots
    NumberingFormat.new(:'i.i', num.count('.'))
  else
    NumberingFormat.unknown
  end
end
146
+
147
# The numbering scheme of a list item, e.g. (a), (i) or (aa).
#
# Formats are ordered by +ordinal+ alone: a format with a higher ordinal
# is treated as nested more deeply than one with a lower ordinal. The +type+
# is a descriptive label and takes no part in comparisons.
class NumberingFormat
  include Comparable

  attr_accessor :type, :ordinal

  # type    - Symbol naming the scheme (:a, :i, :aa, ...).
  # ordinal - Integer used for ordering and equality.
  def initialize(type, ordinal)
    @type = type
    @ordinal = ordinal
  end

  # Two formats are interchangeable when their ordinals match. Anything that
  # does not expose an ordinal is simply not equal, rather than an error.
  def eql?(other)
    other.respond_to?(:ordinal) && ordinal == other.ordinal
  end

  # Kept consistent with #eql? so that formats behave as Hash keys and in
  # Array#uniq.
  def hash
    ordinal.hash
  end

  # Order by ordinal. Returns nil for objects that cannot be compared, as
  # Comparable expects, so that +==+ against a foreign object is false
  # instead of raising.
  def <=>(other)
    return nil unless other.respond_to?(:ordinal)

    ordinal <=> other.ordinal
  end

  def to_s
    @type.to_s
  end

  # Shared instances for the well-known formats. Constants are used rather
  # than class variables, which would be shared with (and writable from)
  # every subclass.
  A = new(:a, 0)
  I = new(:i, 1)
  AA = new(:aa, 2)
  UNKNOWN = new(:unknown, 3)

  def self.a; A; end
  def self.i; I; end
  def self.aa; AA; end
  def self.unknown; UNKNOWN; end
end
179
+ end
180
+ end
181
+ end
@@ -0,0 +1,263 @@
1
+ require 'builder'
2
+ require 'treetop'
3
+
4
+ module Slaw
5
+ module Parse
6
+ # Primary class for building Akoma Ntoso documents.
7
+ #
8
+ # It can convert from plain text a new Akoma Ntoso document, or
9
+ # update existing documents.
10
+ class Builder
11
+ include Slaw::Namespace
12
+ include Slaw::Logging
13
+
14
+ Treetop.load(File.dirname(__FILE__) + "/bylaw.treetop")
15
+
16
+ attr_accessor :parse_options
17
+
18
# Create a builder with an empty set of parse options. Options are
# handed to the parser by #text_to_syntax_tree and can be changed through
# the +parse_options+ accessor.
def initialize
  @parse_options = {}
end
21
+
22
# Try to parse plain text into a syntax tree.
#
# text - the String to parse.
# root - the grammar rule to start parsing from (defaults to :bylaw).
#
# The builder's +parse_options+ are handed to the parser before parsing.
#
# Returns the syntax tree produced by the parser.
# Raises Slaw::Parse::ParseError when the text does not match the grammar;
# the error carries the parser's failure reason, line and column, falling
# back to a generic message and 0 when the parser does not report them.
def text_to_syntax_tree(text, root=:bylaw)
  parser = Slaw::Parse::BylawParser.new
  parser.options = @parse_options

  tree = parser.parse(text, {root: root})
  return tree unless tree.nil?

  raise Slaw::Parse::ParseError.new(parser.failure_reason || "Couldn't match to grammar",
                                    line: parser.failure_line || 0,
                                    column: parser.failure_column || 0)
end
37
+
38
# Generate an XML document from the given syntax tree.
#
# tree - a syntax tree node responding to +to_xml(builder)+, which is asked
#        to write itself inside the root +akomaNtoso+ element.
#
# Returns the XML as a String, starting with an XML declaration.
def xml_from_syntax_tree(tree)
  xml = ""
  markup = ::Builder::XmlMarkup.new(indent: 2, target: xml)

  markup.instruct! :xml, :version=>"1.0", :encoding=>"UTF-8"

  root_attributes = {
    "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
    "xsi:schemaLocation" => "http://www.akomantoso.org/2.0 akomantoso20.xsd",
    "xmlns" => NS
  }
  markup.akomaNtoso(root_attributes) do |b|
    tree.to_xml(b)
  end

  xml
end
52
+
53
# Parse +xml+ with Nokogiri, with the +noblanks+ parse option switched on.
#
# Returns whatever Nokogiri::XML returns for the input.
def parse_xml(xml)
  Nokogiri::XML(xml) { |config| config.noblanks }
end
56
+
57
# Serialise +doc+ by calling its +to_xml+ with an indent of 2.
#
# Returns the value returned by +doc.to_xml+.
def to_xml(doc)
  doc.to_xml(indent: 2)
end
60
+
61
# Run the postprocessing steps on +doc+, in order: normalise headings,
# find the short title, then sanitise.
#
# Each step receives the same +doc+ object. The return value is whatever
# #sanitise returns, so callers should keep using +doc+ itself rather than
# the result of this call.
def postprocess(doc)
  normalise_headings(doc)
  find_short_title(doc)
  sanitise(doc)
end
68
+
69
# Do sanitisations on +doc+: first find and link definitions, then
# re-nest block lists.
#
# Returns the result of #nest_blocklists.
def sanitise(doc)
  link_definitions(doc)
  nest_blocklists(doc)
end
74
+
75
# Recalculate the ids of all <term> elements in +doc+.
#
# Terms are numbered in the order the XPath query returns them, starting at
# zero: trm0, trm1, ...
def renumber_terms(doc)
  logger.info("Renumbering terms")

  term_nodes = doc.xpath('//a:term', a: NS)
  term_nodes.each_with_index do |node, index|
    node['id'] = "trm#{index}"
  end
end
83
+
84
# Change CAPCASE headings into Sentence case.
#
# Applies to the text of headings in the body and in the "schedules"
# component document. Each heading text is downcased, then the word
# character that starts a line (if any) is upcased again.
def normalise_headings(doc)
  logger.info("Normalising headings")

  body_headings = doc.xpath('//a:body//a:heading/text()', a: NS)
  schedule_headings = doc.xpath('//a:component/a:doc[@name="schedules"]//a:heading/text()', a: NS)

  (body_headings + schedule_headings).each do |heading|
    lowered = heading.content.downcase
    heading.content = lowered.gsub(/^\w/) { |initial| initial.upcase }
  end
end
95
+
96
# Find the short title and record it as the +value+ of the FRBRalias
# element in the meta section.
#
# Looks at body headings containing "hort title" and inspects the first
# subsection under the heading's parent for text such as:
#
#   Short title and commencement
#   8. This Act shall be called the Legal Aid Amendment Act, 1996, and shall come
#   into operation on a date fixed by the President by proclamation in the Gazette.
#
# Only the first match is used. When the document has no FRBRalias element
# the title is logged but not stored, rather than failing on a missing node.
def find_short_title(doc)
  logger.info("Finding short title")

  doc.xpath('//a:body//a:heading[contains(text(), "hort title")]', a: NS).each do |heading|
    section = heading.parent.at_xpath('a:subsection', a: NS)
    next unless section && section.text =~ /this act (is|shall be called) the (([a-zA-Z\(\)]\s*)+, \d\d\d\d)/i

    # group 2 is the title together with its year, e.g. "Legal Aid Amendment Act, 1996"
    short_title = Regexp.last_match(2)

    logger.info("+ Found title: #{short_title}")

    node = doc.at_xpath('//a:meta//a:FRBRalias', a: NS)
    if node
      node['value'] = short_title
    else
      logger.info("+ No FRBRalias element found, short title not recorded")
    end
    break
  end
end
117
+
118
# Find definitions of terms and introduce them into the meta section of
# the document.
#
# Steps, in order: collect the defined terms, register them as references,
# mark up the places where the terms are used, then renumber all <term>
# elements.
#
# Returns the result of #renumber_terms.
def link_definitions(doc)
  logger.info("Finding and linking definitions")

  definitions = find_definitions(doc)

  add_terms_to_references(doc, definitions)
  find_term_references(doc, definitions)
  renumber_terms(doc)
end
128
+
129
# Collect the terms defined in +doc+.
#
# First marks up likely definitions (see #guess_at_definitions), then reads
# every <def> element, e.g.
#
#   <p>"<def refersTo="#term-affected_land">affected land</def>" means land in respect of which an application has been lodged in terms of section 17(1);</p>
#
# Returns a Hash from term id (the +refersTo+ attribute without its
# leading '#') to the defined term (the content of the <def>).
def find_definitions(doc)
  guess_at_definitions(doc)

  doc.xpath('//a:def', a: NS).each_with_object({}) do |defn, terms|
    id = defn['refersTo'].sub(/^#/, '')
    term = defn.content
    terms[id] = term

    logger.info("+ Found definition for: #{term}")
  end
end
144
+
145
# Mark up text that looks like a definition with a <def> element.
#
# Only sections whose heading mentions "definitions" or "interpretation"
# are considered. Within those, every <p> and <listIntroduction> whose first
# child is a text node starting with a quoted phrase, as in
#
#   "affected land" means land in respect of which ...
#
# is rewritten to
#
#   <p>"<def refersTo="#term-affected_land">affected land</def>" means land in respect of which an application has been lodged in terms of section 17(1);</p>
#
# and the id of the enclosing element is set to "def-<term id>". Containers
# that already have a <def> child are left alone.
def guess_at_definitions(doc)
  definition_sections = doc.xpath('//a:section', a: NS).select do |section|
    # sections with headings like Definitions
    heading = section.at_xpath('a:heading', a: NS)
    heading && heading.content =~ /definitions|interpretation/i
  end

  definition_sections.each do |section|
    # find items like "foo" means blah...
    section.xpath('.//a:p|.//a:listIntroduction', a: NS).each do |container|
      # only if we don't already have a definition here
      next if container.at_xpath('a:def', a: NS)

      # the quoted term must open the container's first text node
      text = container.children.first
      next unless text && text.text?

      match = /^\s*["“”](.+?)["“”]/.match(text.text)
      next unless match

      term = match.captures[0]
      term_id = 'term-' + term.gsub(/[^a-zA-Z0-9_-]/, '_')

      defn = doc.create_element('def', term, refersTo: "##{term_id}")
      rest = match.post_match

      # end up with: '"' <def>term</def> '"' + rest of the original text
      text.before(defn)
      defn.before(doc.create_text_node('"'))
      text.content = '"' + rest

      # adjust the container's id: prefer the nearest list or point, and
      # fall back to the nearest subsection or section
      parent = find_up(container, ['blockList', 'point']) || find_up(container, ['subsection', 'section'])
      parent['id'] = "def-#{term_id}"
    end
  end
end
181
+
182
# Register the defined +terms+ as TLCTerm elements in the meta section's
# <references> element.
#
# doc   - the document to update in place.
# terms - Hash from term id to the defined term.
#
# A <references source="#this"> element is created after the meta
# <identification> element when the document has none. All existing TLCTerm
# children are removed before the new ones are appended, e.g.
#
#   <TLCTerm id="term-applicant" href="/ontology/term/this.eng.applicant" showAs="Applicant"/>
def add_terms_to_references(doc, terms)
  references = doc.at_xpath('//a:meta/a:references', a: NS)
  unless references
    references = doc.create_element('references', source: "#this")
    doc.at_xpath('//a:meta/a:identification', a: NS).after(references)
  end

  # nuke all existing term reference elements
  references.xpath('a:TLCTerm', a: NS).each(&:remove)

  terms.each do |term_id, label|
    references << doc.create_element('TLCTerm',
                                     id: term_id,
                                     href: "/ontology/term/this.eng.#{term_id.gsub(/^term-/, '')}",
                                     showAs: label)
  end
end
200
+
201
# Find and decorate references to terms in the document.
# The +terms+ param is a hash from term_id to actual term.
#
# Every whole-word occurrence of a term in a body text node is wrapped in a
# <term> element, e.g.
#
#   <p>A delegation under subsection (1) shall not prevent the <term refersTo="#term-Minister" id="trm357">Minister</term> from exercising the power himself or herself.</p>
#
# Text nodes directly inside a <def> or <term> are skipped, as is any text in
# the subsection that holds the term's own definition.
def find_term_references(doc, terms)
  logger.info("+ Finding references to terms")

  counter = 0

  # longest terms first, so that we don't find short terms inside longer
  # terms
  by_length = terms.to_a.sort_by { |_id, label| -label.size }

  by_length.each do |term_id, term|
    pattern = /\b#{Regexp::escape(term)}\b/
    own_definition = ".//a:def[@refersTo='##{term_id}']"

    doc.xpath('//a:body//text()', a: NS).each do |text|
      # unless we're already inside a def or term element
      next if %w[def term].include?(text.parent.name)

      # don't link to a term inside its own definition
      owner = find_up(text, 'subsection')
      next if owner && owner.at_xpath(own_definition, a: NS)

      # replace all occurrences in this text node: each round moves the
      # text before the match and the new <term> in front of the node, and
      # leaves the remainder in the node for the next round
      while (posn = text.content =~ pattern)
        content = text.content
        node = doc.create_element('term', term, refersTo: "##{term_id}", id: "trm#{counter}")

        leading = posn > 0 ? content[0...posn] : nil
        trailing = content[(posn + term.length)..-1]

        text.before(node)
        node.before(doc.create_text_node(leading)) if leading
        text.content = trailing

        counter += 1
      end
    end
  end
end
241
+
242
# Re-nest the block lists in +doc+ by delegating to
# Slaw::Parse::Blocklists.nest_blocklists.
#
# Returns whatever that call returns.
def nest_blocklists(doc)
  logger.info("Nesting blocklists")

  Slaw::Parse::Blocklists.nest_blocklists(doc)
end
247
+
248
+ protected
249
+
250
# Look up the parent chain for an element that matches the given node
# name.
#
# node  - the node whose ancestors are searched, in the order
#         +node.ancestors+ yields them.
# names - a single node name or an Array of acceptable names.
#
# Returns the first matching ancestor, or nil when there is none.
def find_up(node, names)
  wanted = Array(names)

  node.ancestors.find { |ancestor| wanted.include?(ancestor.name) }
end
261
+ end
262
+ end
263
+ end
@@ -0,0 +1,259 @@
1
+ require 'slaw/parse/grammar_helpers'
2
+
3
+ module Slaw
4
+ module Parse
5
grammar Bylaw
  include Slaw::Parse::GrammarHelpers

  # Line-oriented grammar for plain-text by-laws: a preamble, followed by
  # chapters made up of parts, sections and subsections, followed by
  # schedules. Names in angle brackets are the syntax node classes attached
  # to the matched nodes.

  ########
  # major containers

  rule bylaw
    preamble
    chapters:chapter*
    schedules:schedules <Bylaw>
  end

  rule preamble
    empty_line*
    statements:naked_statement* <Preamble>
  end

  rule chapter
    heading:chapter_heading?
    parts:part+ <Chapter>
  end

  rule part
    heading:part_heading?
    sections:section+ <Part>
  end

  rule section
    section_title
    subsections:subsection* <Section>
  end

  rule subsection
    statement:(numbered_statement / naked_statement)
    blocklist:blocklist? <Subsection>
  end

  rule schedules
    schedules:schedule* <ScheduleContainer>
  end

  rule schedule
    schedule_heading
    statements:schedule_statement* <Schedule>
  end

  ##########
  # headings

  rule chapter_heading
    space? chapter_heading_prefix heading:(whitespace content)? eol
    <ChapterHeading>
  end

  rule part_heading
    space? part_heading_prefix eol? space? content eol
    <PartHeading>
  end

  # Which of the two alternatives can match is decided by the
  # :section_number_after_title parse option.
  rule section_title
    section_title_1 / section_1_title
  end

  rule section_title_1
    &{ |s| options[:section_number_after_title] }
    # Section title
    # 1. Section content
    content eol
    section_title_prefix whitespace <SectionTitleType1>
  end

  rule section_1_title
    # 1. Section title
    # Section content
    #
    # Additionally, the section title is optional.
    !{ |s| options[:section_number_after_title] }
    section_title_prefix section_title:section_title_content? eol?
    <SectionTitleType2>
  end

  rule section_title_content
    space !numbered_statement_prefix content eol
    # if a section title ends in a non-character or it's really long, it's probably section content
    # (s[2] is the +content+ node matched above)
    !{ |s| s[2].text_value[-1] =~ /[^a-zA-Z]/ or s[2].text_value.length > 100 }
  end

  rule schedule_heading
    space? schedule_heading_prefix
    space? "\""? num:alphanums? "\""? space?
    eol
    schedule_title:schedule_title_content?
  end

  rule schedule_title_content
    space? content eol

    # if a schedule title ends in a non-character or it's really long, it's probably content
    # (s[1] is the +content+ node matched above)
    !{ |s| s[1].text_value[-1] =~ /[^a-zA-Z]/ or s[1].text_value.length > 100 }
  end

  ##########
  # statements

  rule numbered_statement
    space? numbered_statement_prefix whitespace? (!blocklist_item_prefix content eol)? <NumberedStatement>
  end

  rule naked_statement
    space? !(part / chapter / section / schedule) content eol
    <NakedStatement>
  end

  rule schedule_statement
    space? (!schedule_heading) content eol
  end

  ##########
  # prefixes

  rule part_heading_prefix
    'part'i space alphanums ':'?
  end

  rule chapter_heading_prefix
    'chapter'i space alphanums
  end

  rule schedule_heading_prefix
    'schedule'i 's'i?
  end

  rule section_title_prefix
    number_letter '.'?
  end

  rule numbered_statement_prefix
    # there are two subsection handling syntaxes:
    #
    #   (1) foo
    #   (2A) foo
    #
    # and
    #
    #   8.2 for
    #   8.3 bar
    #
    # The second is less common, but this allows us to handle it.
    # Note that it is usually accompanied by a similar list number format:
    #
    #   8.2.1 item 1
    #   8.2.2 item 2
    #
    # which aren't subsections, but lists, so force the space at the end
    # of the number to catch this case.
    num:('(' number_letter ')')
    /
    num:dotted_number_2 '.'? space
  end

  ##########
  # blocklists

  rule blocklist
    blocklist_item+ <Blocklist>
  end

  rule blocklist_item
    space? blocklist_item_prefix whitespace? item_content:(!blocklist_item_prefix content eol)?
    <BlocklistItem>
  end

  rule blocklist_item_prefix
    ('(' letter_ordinal ')') / dotted_number_3
  end

  rule letter_ordinal
    letter (letter / digit)*
  end

  #########
  ## one line of basic content

  rule content
    # anything but a newline, followed by a
    # newline or end of file (without consuming the newline)
    [^\n]+ &eol
  end

  ##########
  # terminals

  # eg. 2, 2A, 2b
  rule number_letter
    number letter*
  end

  # eg. 8.2.1, 8.2.1.3
  rule dotted_number_3
    number '.' number ('.' number)+
  end

  # eg. 8.2
  rule dotted_number_2
    number '.' number
  end

  rule number
    digit+
  end

  rule digit
    [0-9]
  end

  rule letter
    [a-zA-Z]
  end

  rule alphanums
    [a-zA-Z0-9]+
  end

  rule quotes
    ["“”]
  end

  rule non_quotes
    [^"“”]
  end

  ##########
  # whitespace

  rule space
    [ \t]+
  end

  rule whitespace
    [ \t\n]*
  end

  rule empty_line
    space? newline
  end

  # a newline together with any empty lines that follow it
  rule eol
    newline
    empty_line*
  end

  rule newline
    "\n"
  end
end
258
+ end
259
+ end