slaw 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,171 @@
1
+ # encoding: utf-8
2
+
3
+ module Slaw
4
+ module Parse
5
+ # Helper class to run various cleanup routines on plain text.
6
+ #
7
+ # Some of these routines can safely be run multiple times,
8
+ # others are meant to be run only once.
9
class Cleanser
  # U+FFFC (65532), a codepoint that frequently turns up in incoming text.
  OBJECT_REPLACEMENT_CHAR = [0xFFFC].pack('U*').freeze

  # Patterns whose matches are deleted by #remove_boilerplate, applied in
  # this order.
  BOILERPLATE_PATTERNS = [
    # any line to do with Sabinet and the government printer
    /^.*Sabinet.*Government Printer.*$/i,
    /^.*Provincial Gazette \d+.*$/i,
    /^.*Provinsiale Koerant \d+.*$/i,
    # lines consisting only of digits
    /^\s*\d+\s*$/,
    # date lines
    /^\d+\s+\w+\s+\d+$/,
    # page number lines
    /^\s*page \d+( of \d+)?\s*\n/i,
  ].freeze

  # Run general cleanup, such as stripping bad chars and removing
  # unnecessary whitespace. Intended to be safe to run multiple times.
  #
  # @param s [String] raw text
  # @return [String] cleaned text ending in a newline
  def cleanup(s)
    s = scrub(s)
    s = correct_newlines(s)
    s = fix_quotes(s)
    s = expand_tabs(s)
    s = chomp(s)
    s = enforce_newline(s)
    s = remove_boilerplate(s)

    # Removing boilerplate from the first or last line leaves blank lines at
    # the edges (or an empty string) which a second run would have stripped.
    # Normalise once more so that running cleanup again does not change
    # the result in that case.
    enforce_newline(chomp(s))
  end

  # Run deeper introspections and reformat the text, such as
  # unwrapping/re-wrapping lines. These may not be safe to run
  # multiple times.
  #
  # @param s [String]
  # @return [String] reformatted text ending in a newline
  def reformat(s)
    s = unbreak_lines(s)
    s = break_lines(s)
    s = strip_toc(s)
    enforce_newline(s)
  end

  # ------------------------------------------------------------------------

  # Delete each newline that is followed only by whitespace up to the end
  # of a line.
  def remove_empty_lines(s)
    s.gsub(/\n\s*$/, '')
  end

  # Normalise line endings: both CRLF and a lone CR become LF.
  def correct_newlines(s)
    s.gsub(/\r\n?/, "\n")
  end

  # Strip invalid bytes and ones we don't like.
  def scrub(s)
    # NOTE(review): the second literal was rendered as a single blank
    # character in the source this was taken from. A plain space cannot be
    # intended (the other routines rely on spaces surviving), so it is
    # presumably an invisible control character; form feed is assumed
    # here -- confirm against the upstream file.
    s.gsub(OBJECT_REPLACEMENT_CHAR, '').gsub("\f", '')
  end

  # Change doubled-up "weird" quotes to a normal double quote.
  def fix_quotes(s)
    s.gsub(/‘‘|’’|''/, '"')
  end

  # Replace each tab with a single space.
  def expand_tabs(s)
    s.tr("\t", ' ')
  end

  # Delete publisher boilerplate, digit-only lines, date lines and page
  # number lines (see BOILERPLATE_PATTERNS). Most patterns leave the line's
  # newline in place, so blank lines may remain.
  def remove_boilerplate(s)
    BOILERPLATE_PATTERNS.reduce(s) { |text, pattern| text.gsub(pattern, '') }
  end

  # Remove trailing spaces from every line, then strip whitespace from both
  # ends of the whole string.
  def chomp(s)
    s.gsub(/ +$/, '').strip
  end

  # Ensure the string ends with a newline.
  def enforce_newline(s)
    s.end_with?("\n") ? s : (s + "\n")
  end

  # Make educated guesses about lines that should have been broken but
  # haven't, and break them.
  def break_lines(s)
    # often we find a section title munged onto the same line as its first statement
    # eg:
    # foo bar. New section title 62. (1) For the purpose
    s = s.gsub(/\. ([^.]+) (\d+\. \(1\) )/, ".\n\\1\n\\2")

    # New section title 62. (1) For the purpose
    s = s.gsub(/(\w) (\d+\. \(1\) )/, "\\1\n\\2")

    # (1) foo; (2) bar
    # (1) foo. (2) bar
    s = s.gsub(/(\w{3,}[;.]) (\([0-9a-z]+\))/, "\\1\n\\2")

    # (1) foo; and (2) bar
    # (1) foo; or (2) bar
    s = s.gsub(/; (and|or) \(/, "; \\1\n(")

    # The officer-in-Charge may – (a) remove all withered natural... \n(b)
    # We do this last, because by now we should have recognised that (b)
    # should already be on a new line.
    s = s.gsub(/ (\(a\) .+?\n\(b\))/, "\n\\1")

    # "foo" means ...; "bar" means
    s.gsub(/; (["”“][^"”“]+?["”“] means)/, ";\n\\1")
  end

  # Find likely candidates for unnecessarily broken lines and join them:
  # a line starting with a lowercase letter is appended (after a space) to
  # the previous output line when that line ends in a lowercase letter or
  # a digit.
  def unbreak_lines(s)
    start_re = /^\s*[a-z]/
    end_re = /[a-z0-9]\s*$/

    output = []
    s.split(/\n/).each do |line|
      if !output.empty? && line =~ start_re && output[-1] =~ end_re
        output[-1] = output[-1] + ' ' + line
      else
        output << line
      end
    end

    output.join("\n")
  end

  # Do our best to remove a table of contents at the start; it really
  # confuses the grammar. Returns +s+ unchanged when no TOC is recognised.
  def strip_toc(s)
    # first, try to find 'TABLE OF CONTENTS' anywhere within the first 4K of text
    toc_start = s[0..4096].match(/TABLE OF CONTENTS/i)
    return s unless toc_start

    # grab the first non-blank line after that, it's our end-of-TOC marker
    eol = s.match(/^(.+?)$/, toc_start.end(0))
    return s unless eol

    marker = eol[0]

    # search for the first later line that is a prefix of marker (or vice
    # versa), and delete everything from the TOC heading up to that line
    posn = eol.end(0)
    while (m = s.match(/^(.+?)$/, posn))
      candidate = m[0]
      if marker.start_with?(candidate) || candidate.start_with?(marker)
        return s[0...toc_start.begin(0)] + s[m.begin(0)..-1]
      end

      posn = m.end(0)
    end

    s
  end
end
170
+ end
171
+ end
@@ -0,0 +1,26 @@
1
+ module Slaw
2
+ module Parse
3
# Raised for parse failures; carries the position of the failure.
#
# Inherits from StandardError (not Exception) so that it behaves like an
# ordinary application error; `rescue ParseError` continues to work, and a
# bare `rescue` now catches it too.
class ParseError < StandardError
  # Maximum number of message characters included by #to_json before the
  # message is cut and '...' is appended.
  MAX_JSON_MESSAGE_LENGTH = 200

  attr_accessor :line, :column

  # @param message [String] description of the failure
  # @param opts [Hash] optional position, read from the :line and :column keys
  def initialize(message, opts = {})
    super(message)

    self.line = opts[:line]
    self.column = opts[:column]
  end

  # Serialise to a JSON object with message, line and column keys.
  # Messages longer than MAX_JSON_MESSAGE_LENGTH are truncated to exactly
  # that many characters, followed by '...'.
  #
  # Relies on Hash#to_json being available (the json library).
  #
  # TODO: move this elsewhere, it's out of context here
  def to_json(g = nil)
    msg = message
    if msg.length > MAX_JSON_MESSAGE_LENGTH
      msg = msg[0, MAX_JSON_MESSAGE_LENGTH] + '...'
    end

    {
      message: msg,
      line: line,
      column: column,
    }.to_json(g)
  end
end
25
+ end
26
+ end
@@ -0,0 +1,11 @@
1
+ module Slaw
2
+ module Parse
3
# Mixin that gives the including object a lazily created options hash.
module GrammarHelpers
  # Replace the options wholesale.
  def options=(value)
    @options = value
  end

  # The current options; an empty hash is created and remembered the first
  # time this is called while no (truthy) options are set.
  def options
    @options = {} unless @options
    @options
  end
end
10
+ end
11
+ end
@@ -0,0 +1,371 @@
1
+ module Slaw
2
+ module Parse
3
+ module Bylaw
4
# Root syntax node for a by-law.
class Bylaw < Treetop::Runtime::SyntaxNode
  # Write the by-law to the builder +b+: an `act` element holding the
  # metadata, the preamble (when it has any text) and the body, followed by
  # the schedules, which are written outside the `act` element.
  #
  # The FRBR and publication values are hard-coded placeholders, apart from
  # the manifestation date, which is today's date.
  def to_xml(b)
    b.act(contains: "originalVersion") do |act|
      act.meta do |meta|
        meta.identification(source: "#openbylaws") do |ident|
          # TODO: correct values
          ident.FRBRWork do |work|
            work.FRBRthis(value: '/za/by-law/locale/1980/name/main')
            work.FRBRuri(value: '/za/by-law/locale/1980/name')
            work.FRBRalias(value: 'By-Law Short Title')
            work.FRBRdate(date: '1980-01-01', name: 'Generation')
            work.FRBRauthor(href: '#council', as: '#author')
            work.FRBRcountry(value: 'za')
          end

          ident.FRBRExpression do |expression|
            expression.FRBRthis(value: '/za/by-law/locale/1980/name/main/eng@')
            expression.FRBRuri(value: '/za/by-law/locale/1980/name/eng@')
            expression.FRBRdate(date: '1980-01-01', name: 'Generation')
            expression.FRBRauthor(href: '#council', as: '#author')
            expression.FRBRlanguage(language: 'eng')
          end

          ident.FRBRManifestation do |manifestation|
            manifestation.FRBRthis(value: '/za/by-law/locale/1980/name/main/eng@')
            manifestation.FRBRuri(value: '/za/by-law/locale/1980/name/eng@')
            manifestation.FRBRdate(date: Time.now.strftime('%Y-%m-%d'), name: 'Generation')
            manifestation.FRBRauthor(href: '#openbylaws', as: '#author')
          end
        end

        meta.publication(date: '1980-01-01',
                         name: 'Province of Western Cape: Provincial Gazette',
                         number: 'XXXX',
                         showAs: 'Province of Western Cape: Provincial Gazette')

        # this block takes no parameter; it keeps writing through the
        # builder yielded to the enclosing meta block
        meta.references(source: "#this") do
          meta.TLCOrganization(id: 'openbylaws', href: 'http://openbylaws.org.za', showAs: "openbylaws.org.za")
          meta.TLCOrganization(id: 'council', href: '/ontology/organization/za/council.cape-town', showAs: "Cape Town City Council")
          meta.TLCRole(id: 'author', href: '/ontology/role/author', showAs: 'Author')
        end
      end

      unless preamble.text_value == ""
        act.preamble do |pre|
          preamble.to_xml(pre)
        end
      end

      act.body do |body|
        chapters.elements.each { |chapter| chapter.to_xml(body) }
      end
    end

    schedules.to_xml(b)
  end
end
59
+
60
# Syntax node for the preamble.
class Preamble < Treetop::Runtime::SyntaxNode
  # Write one `p` element per statement, skipping any statement whose text
  # has a line starting with "preamble" (case-insensitive).
  def to_xml(b)
    statements.elements.each do |statement|
      text = statement.content.text_value
      next if text =~ /^preamble/i

      b.p(text)
    end
  end
end
69
+
70
# Syntax node for a part, which may or may not have a heading.
class Part < Treetop::Runtime::SyntaxNode
  # The part number from the heading, or nil when there is no heading.
  def num
    heading.num unless heading.empty?
  end

  # With a heading: write a `part` element wrapping the heading and the
  # sections. Without one: write the sections straight into +b+.
  def to_xml(b)
    if heading.empty?
      # no parts
      sections.elements.each { |section| section.to_xml(b) }
    else
      id = "part-#{num}"

      # include a chapter number in the id if our parent has one
      if parent && parent.parent.is_a?(Chapter) && parent.parent.num
        id = "chapter-#{parent.parent.num}.#{id}"
      end

      b.part(id: id) do |part|
        heading.to_xml(part)
        sections.elements.each { |section| section.to_xml(part) }
      end
    end
  end
end
95
+
96
# Syntax node for a part heading.
class PartHeading < Treetop::Runtime::SyntaxNode
  # Text of the alphanumeric portion of the heading prefix.
  def num
    prefix = part_heading_prefix
    prefix.alphanums.text_value
  end

  # Text of the heading's content.
  def title
    content.text_value
  end

  # Write a `num` element followed by a `heading` element.
  def to_xml(b)
    b.num(num)
    b.heading(title)
  end
end
110
+
111
# Syntax node for a chapter, which may or may not have a heading.
class Chapter < Treetop::Runtime::SyntaxNode
  # The chapter number from the heading, or nil when there is no heading.
  def num
    heading.num unless heading.empty?
  end

  # With a heading: write a `chapter` element wrapping the heading and the
  # parts. Without one: write the parts straight into +b+.
  def to_xml(b)
    if heading.empty?
      # no chapters
      parts.elements.each { |part| part.to_xml(b) }
    else
      id = "chapter-#{num}"

      # include a part number in the id if our parent has one
      if parent && parent.parent.is_a?(Part) && parent.parent.num
        id = "part-#{parent.parent.num}.#{id}"
      end

      b.chapter(id: id) do |chapter|
        heading.to_xml(chapter)
        parts.elements.each { |part| part.to_xml(chapter) }
      end
    end
  end
end
136
+
137
# Syntax node for a chapter heading; the title is optional.
class ChapterHeading < Treetop::Runtime::SyntaxNode
  # Text of the alphanumeric portion of the heading prefix.
  def num
    prefix = chapter_heading_prefix
    prefix.alphanums.text_value
  end

  # The title text, taken from +heading.content+ when this node responds
  # to +heading+, otherwise from +content+ when it responds to that.
  # Returns nil when neither is available.
  def title
    return heading.content.text_value if respond_to?(:heading)
    return content.text_value if respond_to?(:content)

    nil
  end

  # Write a `num` element and, when there is a title, a `heading` element.
  def to_xml(b)
    b.num(num)
    b.heading(title) if title
  end
end
155
+
156
# Syntax node for a section.
class Section < Treetop::Runtime::SyntaxNode
  # Section number, delegated to the section title node.
  def num
    section_title.num
  end

  # Section title text, delegated to the section title node.
  def title
    section_title.title
  end

  # Write a `section` element with id "section-<num>" containing the number
  # (with a trailing dot), the heading and every subsection. Subsections
  # receive their index and "<section id>." as the id prefix.
  def to_xml(b)
    id = "section-#{num}"

    b.section(id: id) do |section|
      section.num("#{num}.")
      section.heading(title)

      idprefix = "#{id}."

      subsections.elements.each_with_index do |subsection, index|
        subsection.to_xml(section, index, idprefix)
      end
    end
  end
end
177
+
178
# A section title of the form:
#
#   Definitions
#   1. In this by-law...
class SectionTitleType1 < Treetop::Runtime::SyntaxNode
  # Text of the number/letter portion of the title prefix.
  def num
    prefix = section_title_prefix
    prefix.number_letter.text_value
  end

  # Text of the title's content.
  def title
    content.text_value
  end
end
192
+
193
# A section title of the form:
#
#   1. Definitions
#   In this by-law...
#
# In this format, the title is optional and the section content may
# start where we think the title is.
class SectionTitleType2 < Treetop::Runtime::SyntaxNode
  # Text of the number/letter portion of the title prefix.
  def num
    prefix = section_title_prefix
    prefix.number_letter.text_value
  end

  # Title text, or an empty string when no title was matched.
  def title
    return "" if section_title.empty?

    section_title.content.text_value
  end
end
210
+
211
# Syntax node for a subsection: a statement optionally followed by a
# block list.
class Subsection < Treetop::Runtime::SyntaxNode
  # Write a `subsection` element.
  #
  # The id is +idprefix+ plus the statement number with parentheses removed
  # for a numbered statement, or "subsection-<i>" otherwise. Nested ids are
  # prefixed with this subsection's id followed by a dot.
  #
  # @param b builder to write to
  # @param i index of this subsection within its parent
  # @param idprefix [String] id prefix supplied by the parent
  def to_xml(b, i, idprefix)
    numbered = statement.is_a?(NumberedStatement)

    own_id = if numbered
               idprefix + statement.num.gsub(/[()]/, '')
             else
               idprefix + "subsection-#{i}"
             end
    attribs = {id: own_id}
    child_prefix = own_id + "."

    b.subsection(attribs) do |subsection|
      subsection.num(statement.num) if numbered

      subsection.content do |inner|
        if blocklist && blocklist.is_a?(Blocklist)
          if statement.content
            # the statement text becomes the list introduction
            # NOTE(review): `<<` presumably appends the text unescaped with
            # this builder -- confirm how special characters are handled
            blocklist.to_xml(inner, i, child_prefix) { |intro| intro << statement.content.text_value }
          else
            blocklist.to_xml(inner, i, child_prefix)
          end
        elsif statement.content
          # raw content
          inner.p(statement.content.text_value)
        end
      end
    end
  end
end
239
+
240
# Syntax node for a statement that starts with a number.
class NumberedStatement < Treetop::Runtime::SyntaxNode
  # Text of the number in the statement prefix.
  def num
    numbered_statement_prefix.num.text_value
  end

  # True unless the prefix responds to +dotted_number_2+.
  def parentheses?
    dotted = numbered_statement_prefix.respond_to?(:dotted_number_2)
    !dotted
  end

  # The content node of the fourth child element, or nil when that element
  # matched no text.
  def content
    tail = elements[3]
    return nil if tail.text_value == ""

    tail.content
  end
end
257
+
258
# Syntax node for a statement without a number; adds no behaviour of its
# own.
class NakedStatement < Treetop::Runtime::SyntaxNode; end
260
+
261
# Syntax node for a block list.
class Blocklist < Treetop::Runtime::SyntaxNode
  # Render a block list to xml. If a block is given, it is called with a
  # builder positioned inside a `listIntroduction` element so the caller
  # can fill in the introduction.
  #
  # The list id is +idprefix+ plus "list<i>"; each child element is
  # rendered with that id plus a dot as its prefix.
  def to_xml(b, i, idprefix, &block)
    list_id = idprefix + "list#{i}"
    item_prefix = list_id + '.'

    b.blockList(id: list_id) do |list|
      list.listIntroduction { |intro| block.call(intro) } if block

      elements.each { |item| item.to_xml(list, item_prefix) }
    end
  end
end
275
+
276
# Syntax node for one item of a block list.
class BlocklistItem < Treetop::Runtime::SyntaxNode
  # Text of the item prefix, e.g. including any parentheses.
  def num
    blocklist_item_prefix.text_value
  end

  # Text of the item's content, or nil when this node has no
  # +item_content+ or that node has no +content+.
  def content
    # TODO this really seems a bit odd
    return unless respond_to?(:item_content) && item_content.respond_to?(:content)

    item_content.content.text_value
  end

  # Write an `item` element whose id is +idprefix+ plus the number with
  # parentheses removed; it holds the number and, when present, the content
  # in a `p` element.
  def to_xml(b, idprefix)
    item_id = idprefix + num.gsub(/[()]/, '')

    b.item(id: item_id) do |item|
      item.num(num)
      item.p(content) if content
    end
  end
end
293
+
294
# Syntax node that holds all the schedules of a by-law.
class ScheduleContainer < Treetop::Runtime::SyntaxNode
  # Write the schedules as a single `components/component/doc` structure
  # with its own metadata and a `mainBody` holding every schedule. Writes
  # nothing when there are no schedules.
  #
  # The FRBR values are hard-coded placeholders, apart from the
  # manifestation date, which is today's date.
  def to_xml(b)
    return if schedules.elements.empty?

    b.components do |components|
      components.component(id: 'component-0') do |component|
        component.doc(name: 'schedules') do |doc|
          doc.meta do |meta|
            meta.identification(source: "#openbylaws") do |ident|
              ident.FRBRWork do |work|
                work.FRBRthis(value: '/za/by-law/locale/1980/name/main/schedules')
                work.FRBRuri(value: '/za/by-law/locale/1980/name/schedules')
                work.FRBRdate(date: '1980-01-01', name: 'Generation')
                work.FRBRauthor(href: '#council', as: '#author')
                work.FRBRcountry(value: 'za')
              end

              ident.FRBRExpression do |expression|
                # previously 'main//schedules'; the stray slash made this
                # value disagree with the Work and Manifestation values
                expression.FRBRthis(value: '/za/by-law/locale/1980/name/main/schedules/eng@')
                expression.FRBRuri(value: '/za/by-law/locale/1980/name/schedules/eng@')
                expression.FRBRdate(date: '1980-01-01', name: 'Generation')
                expression.FRBRauthor(href: '#council', as: '#author')
                expression.FRBRlanguage(language: 'eng')
              end

              ident.FRBRManifestation do |manifestation|
                manifestation.FRBRthis(value: '/za/by-law/locale/1980/name/main/schedules/eng@')
                manifestation.FRBRuri(value: '/za/by-law/locale/1980/name/schedules/eng@')
                manifestation.FRBRdate(date: Time.now.strftime('%Y-%m-%d'), name: 'Generation')
                manifestation.FRBRauthor(href: '#openbylaws', as: '#author')
              end
            end
          end

          doc.mainBody do |main_body|
            schedules.elements.each_with_index do |schedule, index|
              schedule.to_xml(main_body, index)
            end
          end
        end
      end
    end
  end
end
334
+
335
# Syntax node for a single schedule.
class Schedule < Treetop::Runtime::SyntaxNode
  # The schedule number text, or nil when it is missing or empty.
  def num
    text = schedule_heading.num.text_value
    return nil unless text
    return nil if text.empty?

    text
  end

  # The schedule title text, or nil when the title node has no content.
  def heading
    title_node = schedule_heading.schedule_title
    title_node.respond_to?(:content) ? title_node.content.text_value : nil
  end

  # Write the schedule as a `chapter` element whose id is
  # "schedule-<num>", or "schedules" when there is no number. The
  # statements go into a single `section` (id "<chapter id>.section-0"),
  # one `p` element each.
  #
  # @param b builder to write to
  # @param i index of the schedule; not used by this method
  def to_xml(b, i)
    n = num
    id = n ? "schedule-#{n}" : "schedules"

    b.chapter(id: id) do |chapter|
      chapter.num(num) if num
      chapter.heading(heading) if heading

      chapter.section(id: id + ".section-0") do |section|
        section.content do |inner|
          statements.elements.each { |statement| inner.p(statement.content.text_value) }
        end
      end
    end
  end
end
369
+ end
370
+ end
371
+ end
@@ -0,0 +1,53 @@
1
+ module Slaw
2
+ module Render
3
+
4
+ # Support for transforming XML AN documents into HTML.
5
class HTMLRenderer
  # Load the `act` and `fragment` stylesheets from the xsl directory that
  # sits next to this file.
  def initialize
    here = File.dirname(__FILE__)

    @xslt = {
      act: load_stylesheet(File.join(here, 'xsl/act.xsl')),
      fragment: load_stylesheet(File.join(here, 'xsl/fragment.xsl')),
    }
  end

  # Transform an entire XML document +doc+ (a Nokogiri::XML::Document object) into HTML.
  # Specify +base_url+ to manage the base for relative URLs generated by
  # the transform.
  #
  # @return [String] the transform result
  def render(doc, base_url='')
    params = transform_params({'base_url' => base_url})
    run_xslt(:act, doc, params)
  end

  # Transform just a single node and its children into HTML.
  #
  # If +node+ has an id, we use xpath to tell the XSLT which
  # element to transform. Otherwise we make the node the root of a new
  # document and apply the XSLT to that.
  #
  # NOTE(review): in the no-id branch the node itself is assigned as the
  # new document's root, which presumably detaches it from the document it
  # came from -- confirm callers do not rely on it staying in place.
  #
  # @return [String] the transform result
  def render_node(node, base_url='')
    params = transform_params({'base_url' => base_url})

    if node.id
      # the xpath is appended after quoting on purpose, so it is passed to
      # the stylesheet as an expression rather than a quoted string
      params += ['root_elem', "//*[@id='#{node.id}']"]
      doc = node.document
    else
      # create a new document with just this element at the root
      doc = Nokogiri::XML::Document.new
      doc.root = node
      params += ['root_elem', '*']
    end

    run_xslt(:fragment, doc, params)
  end

  # Apply the stylesheet registered under +xslt+ (:act or :fragment) to
  # +doc+ with +params+ and return the result as a string.
  def run_xslt(xslt, doc, params)
    @xslt[xslt].transform(doc, params).to_s
  end

  # Quote a hash of parameters for passing to an XSLT transform.
  def transform_params(params)
    Nokogiri::XSLT.quote_params(params)
  end

  private

  # Parse the stylesheet at +path+. The file is opened in block form so the
  # handle is closed once parsing is done; a File (rather than its contents
  # as a string) is handed to Nokogiri so the stylesheet keeps its path.
  # NOTE(review): presumably that path is what lets relative xsl:import
  # references resolve -- confirm before switching to File.read.
  def load_stylesheet(path)
    File.open(path) { |file| Nokogiri::XSLT(file) }
  end
end
52
+ end
53
+ end
@@ -0,0 +1,15 @@
1
<?xml version="1.0"?>
<!-- Entry-point stylesheet for rendering a whole act as HTML. -->
<xsl:stylesheet version="1.0"
                xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                xmlns:a="http://www.akomantoso.org/2.0"
                exclude-result-prefixes="a">

  <!-- templates are imported from elements.xsl -->
  <xsl:import href="elements.xsl" />

  <xsl:output method="html" />

  <!-- start at the document root and process only akomaNtoso/act -->
  <xsl:template match="/">
    <xsl:apply-templates select="a:akomaNtoso/a:act" />
  </xsl:template>

</xsl:stylesheet>
15
+