slaw 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,171 @@
1
+ # encoding: utf-8
2
+
3
+ module Slaw
4
+ module Parse
5
+ # Helper class to run various cleanup routines on plain text.
6
+ #
7
+ # Some of these routines can safely be run multiple times,
8
+ # others are meant to be run only once.
9
class Cleanser

  # Run the general cleanup pipeline: strip unwanted characters, normalise
  # line endings, quotes and tabs, trim whitespace, make sure the text ends
  # with a newline and finally drop boilerplate lines.
  #
  # Intended to be safe to run more than once on the same text.
  def cleanup(s)
    s = scrub(s)
    s = correct_newlines(s)
    s = fix_quotes(s)
    s = expand_tabs(s)
    s = chomp(s)
    s = enforce_newline(s)
    remove_boilerplate(s)
  end

  # Run the heavier reformatting steps: re-join lines that look wrongly
  # wrapped, split lines that look wrongly joined, drop a leading table of
  # contents and make sure the text ends with a newline.
  #
  # These steps rewrite line structure and may not be safe to run repeatedly.
  def reformat(s)
    s = unbreak_lines(s)
    s = break_lines(s)
    s = strip_toc(s)
    enforce_newline(s)
  end

  # ------------------------------------------------------------------------

  # Remove every newline that is followed only by whitespace up to the end
  # of a line.
  def remove_empty_lines(s)
    s.gsub(/\n\s*$/, '')
  end

  # Normalise line endings: CRLF and lone CR both become LF.
  def correct_newlines(s)
    s.gsub(/\r\n/, "\n").gsub(/\r/, "\n")
  end

  # Strip characters we never want in the text.
  def scrub(s)
    # we often get U+FFFC (object replacement character) in the string, nuke it
    cleaned = s.gsub([65532].pack('U*'), '')

    # NOTE(review): the second character to strip is a single
    # whitespace-looking character whose code point cannot be recovered from
    # the listing this file was taken from. Read as a plain space it would
    # delete every space in the text, which contradicts the space-dependent
    # patterns in break_lines, so it is assumed here to be a form feed.
    # Confirm against the upstream file.
    cleaned.gsub("\f", '')
  end

  # Replace doubled single quotes (curly or straight) with one double quote.
  def fix_quotes(s)
    s.gsub(/‘‘|’’|''/, '"')
  end

  # Replace each tab with a single space.
  def expand_tabs(s)
    s.gsub(/\t/, ' ')
  end

  # Blank out boilerplate lines: publisher/gazette banners, lines consisting
  # only of a number, date-like lines and "page N of M" lines.
  def remove_boilerplate(s)
    # nuke any line to do with Sabinet and the government printer
    s = s.gsub(/^.*Sabinet.*Government Printer.*$/i, '')
    s = s.gsub(/^.*Provincial Gazette \d+.*$/i, '')
    s = s.gsub(/^.*Provinsiale Koerant \d+.*$/i, '')
    # lines holding nothing but a number
    s = s.gsub(/^\s*\d+\s*$/, '')
    # get rid of date lines
    s = s.gsub(/^\d+\s+\w+\s+\d+$/, '')
    # get rid of page number lines (these also consume the line break)
    s.gsub(/^\s*page \d+( of \d+)?\s*\n/i, '')
  end

  # Remove trailing spaces from every line, then strip whitespace from both
  # ends of the whole text.
  def chomp(s)
    s.gsub(/ +$/, '').strip
  end

  # Return +s+ unchanged if it already ends with a newline, otherwise a copy
  # with one appended.
  def enforce_newline(s)
    return s if s.end_with?("\n")

    s + "\n"
  end

  # Make educated guesses about lines that should have been broken but
  # haven't, and break them.
  def break_lines(s)
    # often we find a section title munged onto the same line as its first statement
    # eg:
    #   foo bar. New section title 62. (1) For the purpose
    s = s.gsub(/\. ([^.]+) (\d+\. \(1\) )/, ".\n\\1\n\\2")

    #   New section title 62. (1) For the purpose
    s = s.gsub(/(\w) (\d+\. \(1\) )/, "\\1\n\\2")

    #   (1) foo; (2) bar
    #   (1) foo. (2) bar
    s = s.gsub(/(\w{3,}[;.]) (\([0-9a-z]+\))/, "\\1\n\\2")

    #   (1) foo; and (2) bar
    #   (1) foo; or (2) bar
    s = s.gsub(/; (and|or) \(/, "; \\1\n(")

    #   The officer-in-Charge may – (a) remove all withered natural... \n(b)
    # We do this last, because by now we should have recognised that (b)
    # should already be on a new line.
    s = s.gsub(/ (\(a\) .+?\n\(b\))/, "\n\\1")

    #   "foo" means ...; "bar" means
    s.gsub(/; (["”“][^"”“]+?["”“] means)/, ";\n\\1")
  end

  # Find likely candidates for unnecessarily broken lines and join them.
  #
  # A line is appended (with a single space) to the line before it when it
  # starts with a lowercase letter and the previous line ends with a
  # lowercase letter or digit, ignoring surrounding whitespace.
  def unbreak_lines(s)
    start_re = /^\s*[a-z]/
    end_re = /[a-z0-9]\s*$/

    joined = s.split(/\n/).each_with_object([]) do |line, output|
      if !output.empty? && line =~ start_re && output[-1] =~ end_re
        output[-1] = output[-1] + ' ' + line
      else
        output << line
      end
    end

    joined.join("\n")
  end

  # Do our best to remove a table of contents at the start; it really
  # confuses the grammar.
  #
  # Looks for 'TABLE OF CONTENTS' near the start of the text, takes the first
  # non-empty line after it as a marker, and deletes everything from the
  # heading up to the next line that is a prefix of the marker (or vice
  # versa). Returns +s+ unchanged when no such heading or repeat is found.
  def strip_toc(s)
    # only look for the heading near the start of the text
    toc_start = s[0..4096].match(/TABLE OF CONTENTS/i)
    return s unless toc_start

    # the first non-empty line after the heading is our end-of-TOC marker
    eol = s.match(/^(.+?)$/, toc_start.end(0))
    return s unless eol

    marker = eol[0]
    posn = eol.end(0)

    while (m = s.match(/^(.+?)$/, posn))
      if marker.start_with?(m[0]) || m[0].start_with?(marker)
        return s[0...toc_start.begin(0)] + s[m.begin(0)..-1]
      end

      posn = m.end(0)
    end

    s
  end
end
170
+ end
171
+ end
@@ -0,0 +1,26 @@
1
+ module Slaw
2
+ module Parse
3
# Error carrying the position (line and column) at which parsing failed.
#
# NOTE(review): this inherits from Exception rather than StandardError, so a
# bare `rescue` will not catch it. The superclass is left unchanged here so
# existing callers keep their behaviour; consider switching to StandardError.
class ParseError < Exception
  attr_accessor :line, :column

  # +message+ is the error text. +opts+ may carry +:line+ and +:column+;
  # it defaults to an empty hash so that `raise ParseError, "msg"` works.
  def initialize(message, opts = {})
    super(message)

    self.line = opts[:line]
    self.column = opts[:column]
  end

  # Serialise to JSON as an object with message, line and column. Messages
  # longer than 200 characters are cut to their first 200 characters and
  # suffixed with '...'.
  #
  # Relies on Hash#to_json, i.e. the json library must already be loaded.
  #
  # TODO: move this elsewhere, it's out of context here
  def to_json(g = nil)
    msg = message
    # exclusive range: keep exactly 200 characters, matching the threshold
    msg = msg[0...200] + '...' if msg.length > 200

    {
      message: msg,
      line: line,
      column: column,
    }.to_json(g)
  end
end
25
+ end
26
+ end
@@ -0,0 +1,11 @@
1
+ module Slaw
2
+ module Parse
3
# Mixin giving the including object an +options+ hash.
module GrammarHelpers
  attr_writer :options

  # The current options; an empty hash is created and stored the first
  # time this is read if nothing (or a falsy value) has been assigned.
  def options
    @options = {} unless @options
    @options
  end
end
10
+ end
11
+ end
@@ -0,0 +1,371 @@
1
+ module Slaw
2
+ module Parse
3
+ module Bylaw
4
class Bylaw < Treetop::Runtime::SyntaxNode
  # Serialise the whole by-law through the builder +b+: an +act+ element
  # holding the metadata, the preamble (when it has any text) and the body,
  # followed by the schedules, which are written after the +act+ element.
  #
  # The FRBR identifiers, dates and publication details are fixed values
  # written into this method (see the TODO below).
  def to_xml(b)
    b.act(contains: "originalVersion") do |act|
      act.meta do |meta|
        meta.identification(source: "#openbylaws") do |ident|
          # TODO: correct values
          ident.FRBRWork do |work|
            work.FRBRthis(value: '/za/by-law/locale/1980/name/main')
            work.FRBRuri(value: '/za/by-law/locale/1980/name')
            work.FRBRalias(value: 'By-Law Short Title')
            work.FRBRdate(date: '1980-01-01', name: 'Generation')
            work.FRBRauthor(href: '#council', as: '#author')
            work.FRBRcountry(value: 'za')
          end

          ident.FRBRExpression do |expr|
            expr.FRBRthis(value: '/za/by-law/locale/1980/name/main/eng@')
            expr.FRBRuri(value: '/za/by-law/locale/1980/name/eng@')
            expr.FRBRdate(date: '1980-01-01', name: 'Generation')
            expr.FRBRauthor(href: '#council', as: '#author')
            expr.FRBRlanguage(language: 'eng')
          end

          ident.FRBRManifestation do |manif|
            manif.FRBRthis(value: '/za/by-law/locale/1980/name/main/eng@')
            manif.FRBRuri(value: '/za/by-law/locale/1980/name/eng@')
            # the manifestation is stamped with today's date
            manif.FRBRdate(date: Time.now.strftime('%Y-%m-%d'), name: 'Generation')
            manif.FRBRauthor(href: '#openbylaws', as: '#author')
          end
        end

        meta.publication(date: '1980-01-01',
                         name: 'Province of Western Cape: Provincial Gazette',
                         number: 'XXXX',
                         showAs: 'Province of Western Cape: Provincial Gazette')

        # this block takes no argument, so it keeps writing through +meta+
        meta.references(source: "#this") do
          meta.TLCOrganization(id: 'openbylaws', href: 'http://openbylaws.org.za', showAs: "openbylaws.org.za")
          meta.TLCOrganization(id: 'council', href: '/ontology/organization/za/council.cape-town', showAs: "Cape Town City Council")
          meta.TLCRole(id: 'author', href: '/ontology/role/author', showAs: 'Author')
        end
      end

      if preamble.text_value != ""
        act.preamble do |pre|
          preamble.to_xml(pre)
        end
      end

      act.body do |body|
        chapters.elements.each { |chapter| chapter.to_xml(body) }
      end
    end

    schedules.to_xml(b)
  end
end
59
+
60
class Preamble < Treetop::Runtime::SyntaxNode
  # Emit one +p+ element per preamble statement, skipping any statement
  # whose text has a line starting with "preamble" (case-insensitive).
  def to_xml(b)
    statements.elements.each do |stmt|
      next if stmt.content.text_value =~ /^preamble/i

      b.p(stmt.content.text_value)
    end
  end
end
69
+
70
class Part < Treetop::Runtime::SyntaxNode
  # The part number taken from the heading, or nil when there is no heading.
  def num
    heading.num unless heading.empty?
  end

  # Write this part through the builder +b+. With a heading, the sections
  # are wrapped in a +part+ element; without one they are written directly.
  def to_xml(b)
    if heading.empty?
      # no parts
      sections.elements.each { |section| section.to_xml(b) }
    else
      id = "part-#{num}"

      # include a chapter number in the id if our parent has one
      if parent && parent.parent.is_a?(Chapter) && parent.parent.num
        id = "chapter-#{parent.parent.num}.#{id}"
      end

      b.part(id: id) do |part|
        heading.to_xml(part)
        sections.elements.each { |section| section.to_xml(part) }
      end
    end
  end
end
95
+
96
class PartHeading < Treetop::Runtime::SyntaxNode
  # Text of the alphanumeric identifier in the heading prefix.
  def num
    part_heading_prefix.alphanums.text_value
  end

  # Text of the heading's content node.
  def title
    content.text_value
  end

  # Write the +num+ and +heading+ elements through the builder +b+.
  def to_xml(b)
    b.num(num)
    b.heading(title)
  end
end
110
+
111
class Chapter < Treetop::Runtime::SyntaxNode
  # The chapter number taken from the heading, or nil when there is no heading.
  def num
    heading.num unless heading.empty?
  end

  # Write this chapter through the builder +b+. With a heading, the parts
  # are wrapped in a +chapter+ element; without one they are written directly.
  def to_xml(b)
    if heading.empty?
      # no chapters
      parts.elements.each { |part| part.to_xml(b) }
    else
      id = "chapter-#{num}"

      # include a part number in the id if our parent has one
      if parent && parent.parent.is_a?(Part) && parent.parent.num
        id = "part-#{parent.parent.num}.#{id}"
      end

      b.chapter(id: id) do |chapter|
        heading.to_xml(chapter)
        parts.elements.each { |part| part.to_xml(chapter) }
      end
    end
  end
end
136
+
137
class ChapterHeading < Treetop::Runtime::SyntaxNode
  # Text of the alphanumeric identifier in the heading prefix.
  def num
    chapter_heading_prefix.alphanums.text_value
  end

  # The heading text. Read from +heading.content+ when this node has a
  # +heading+ accessor, otherwise from +content+ when it has one; nil if it
  # has neither.
  def title
    if respond_to?(:heading)
      heading.content.text_value
    elsif respond_to?(:content)
      content.text_value
    end
  end

  # Write the +num+ element and, when there is a title, the +heading+
  # element through the builder +b+.
  def to_xml(b)
    b.num(num)
    b.heading(title) if title
  end
end
155
+
156
class Section < Treetop::Runtime::SyntaxNode
  # Section number, as reported by the section title node.
  def num
    section_title.num
  end

  # Section title text, as reported by the section title node.
  def title
    section_title.title
  end

  # Write a +section+ element with id "section-<num>" through the builder
  # +b+, containing the number (with a trailing dot), the heading and each
  # subsection. Subsections receive their index and "section-<num>." as
  # their id prefix.
  def to_xml(b)
    id = "section-#{num}"

    b.section(id: id) do |section|
      section.num("#{num}.")
      section.heading(title)

      idprefix = "#{id}."

      subsections.elements.each_with_index do |subsection, i|
        subsection.to_xml(section, i, idprefix)
      end
    end
  end
end
177
+
178
# A section title of the form:
#
#   Definitions
#   1. In this by-law...
class SectionTitleType1 < Treetop::Runtime::SyntaxNode
  # Text of the number/letter in the title prefix.
  def num
    section_title_prefix.number_letter.text_value
  end

  # Text of the title's content node.
  def title
    content.text_value
  end
end
192
+
193
# A section title of the form:
#
#   1. Definitions
#   In this by-law...
#
# In this format, the title is optional and the section content may
# start where we think the title is.
class SectionTitleType2 < Treetop::Runtime::SyntaxNode
  # Text of the number/letter in the title prefix.
  def num
    section_title_prefix.number_letter.text_value
  end

  # The title text, or an empty string when the title node is empty.
  def title
    return "" if section_title.empty?

    section_title.content.text_value
  end
end
210
+
211
class Subsection < Treetop::Runtime::SyntaxNode
  # Write a +subsection+ element through the builder +b+.
  #
  # +i+ is this subsection's index within its section and +idprefix+ the id
  # prefix supplied by the caller. A numbered statement gets an id built
  # from its number with parentheses removed; any other statement gets
  # "subsection-<i>".
  def to_xml(b, i, idprefix)
    id = if statement.is_a?(NumberedStatement)
           idprefix + statement.num.gsub(/[()]/, '')
         else
           idprefix + "subsection-#{i}"
         end
    attribs = {id: id}

    # children are prefixed with this subsection's own id
    idprefix = attribs[:id] + "."

    b.subsection(attribs) do |sub|
      sub.num(statement.num) if statement.is_a?(NumberedStatement)

      sub.content do |inner|
        if blocklist && blocklist.is_a?(Blocklist)
          if statement.content
            # the statement text becomes the list's introduction
            # NOTE(review): `<<` on XML builders typically appends its
            # argument as raw markup rather than escaped text - confirm this
            # is safe for statement text containing '<' or '&'.
            blocklist.to_xml(inner, i, idprefix) { |intro| intro << statement.content.text_value }
          else
            blocklist.to_xml(inner, i, idprefix)
          end
        else
          # raw content
          inner.p(statement.content.text_value) if statement.content
        end
      end
    end
  end
end
239
+
240
class NumberedStatement < Treetop::Runtime::SyntaxNode
  # Text of the number in the statement prefix.
  def num
    numbered_statement_prefix.num.text_value
  end

  # True unless the prefix node responds to +dotted_number_2+.
  def parentheses?
    !numbered_statement_prefix.respond_to?(:dotted_number_2)
  end

  # The content node of the fourth child element, or nil when that element
  # matched no text.
  def content
    return nil if elements[3].text_value == ""

    elements[3].content
  end
end
257
+
258
# Syntax node class for a statement without a number prefix; it adds no
# behaviour of its own here.
class NakedStatement < Treetop::Runtime::SyntaxNode; end
260
+
261
class Blocklist < Treetop::Runtime::SyntaxNode
  # Render a block list to xml through the builder +b+. The list's id is
  # +idprefix+ followed by "list<i>", and each item is rendered with that id
  # plus a dot as its prefix.
  #
  # If a block is given, a +listIntroduction+ node is written first and its
  # builder is yielded to the block to fill it in.
  def to_xml(b, i, idprefix, &block)
    id = idprefix + "list#{i}"
    idprefix = id + '.'

    b.blockList(id: id) do |list|
      if block_given?
        list.listIntroduction { |intro| yield intro }
      end

      elements.each { |item| item.to_xml(list, idprefix) }
    end
  end
end
275
+
276
class BlocklistItem < Treetop::Runtime::SyntaxNode
  # Text of the item prefix.
  def num
    blocklist_item_prefix.text_value
  end

  # The item's text, or nil when this node has no +item_content+ accessor
  # or that node has no +content+ accessor.
  def content
    # TODO this really seems a bit odd
    return unless respond_to?(:item_content) && item_content.respond_to?(:content)

    item_content.content.text_value
  end

  # Write an +item+ element through the builder +b+. Its id is +idprefix+
  # followed by the item number with parentheses removed.
  def to_xml(b, idprefix)
    b.item(id: idprefix + num.gsub(/[()]/, '')) do |item|
      item.num(num)
      item.p(content) if content
    end
  end
end
293
+
294
class ScheduleContainer < Treetop::Runtime::SyntaxNode
  # Write the schedules through the builder +b+ as a +components+ element
  # holding one +component+ with a +doc+ named 'schedules'. Nothing is
  # written when there are no schedules.
  #
  # The FRBR identifiers and dates are fixed values written into this
  # method.
  def to_xml(b)
    return if schedules.elements.empty?

    b.components do |components|
      components.component(id: 'component-0') do |component|
        component.doc(name: 'schedules') do |doc|
          doc.meta do |meta|
            meta.identification(source: "#openbylaws") do |ident|
              ident.FRBRWork do |work|
                work.FRBRthis(value: '/za/by-law/locale/1980/name/main/schedules')
                work.FRBRuri(value: '/za/by-law/locale/1980/name/schedules')
                work.FRBRdate(date: '1980-01-01', name: 'Generation')
                work.FRBRauthor(href: '#council', as: '#author')
                work.FRBRcountry(value: 'za')
              end

              ident.FRBRExpression do |expr|
                # single slash before "schedules", consistent with the Work
                # and Manifestation identifiers
                expr.FRBRthis(value: '/za/by-law/locale/1980/name/main/schedules/eng@')
                expr.FRBRuri(value: '/za/by-law/locale/1980/name/schedules/eng@')
                expr.FRBRdate(date: '1980-01-01', name: 'Generation')
                expr.FRBRauthor(href: '#council', as: '#author')
                expr.FRBRlanguage(language: 'eng')
              end

              ident.FRBRManifestation do |manif|
                manif.FRBRthis(value: '/za/by-law/locale/1980/name/main/schedules/eng@')
                manif.FRBRuri(value: '/za/by-law/locale/1980/name/schedules/eng@')
                # the manifestation is stamped with today's date
                manif.FRBRdate(date: Time.now.strftime('%Y-%m-%d'), name: 'Generation')
                manif.FRBRauthor(href: '#openbylaws', as: '#author')
              end
            end
          end

          doc.mainBody do |main|
            schedules.elements.each_with_index { |schedule, i| schedule.to_xml(main, i) }
          end
        end
      end
    end
  end
end
334
+
335
class Schedule < Treetop::Runtime::SyntaxNode
  # The schedule number text from the heading, or nil when it is empty.
  def num
    n = schedule_heading.num.text_value
    n if n && !n.empty?
  end

  # The schedule title text, or nil when the title node has no +content+
  # accessor.
  def heading
    title = schedule_heading.schedule_title
    title.respond_to?(:content) ? title.content.text_value : nil
  end

  # Write this schedule through the builder +b+ as a +chapter+ element with
  # id "schedule-<num>" (or "schedules" when it has no number). All
  # statements go into a single section, one +p+ element each.
  #
  # +i+ is the schedule's index as passed by the caller; it is not used.
  def to_xml(b, i)
    n = num
    id = n ? "schedule-#{n}" : "schedules"

    b.chapter(id: id) do |chapter|
      chapter.num(num) if num
      chapter.heading(heading) if heading

      chapter.section(id: id + ".section-0") do |section|
        section.content do |content|
          statements.elements.each { |stmt| content.p(stmt.content.text_value) }
        end
      end
    end
  end
end
369
+ end
370
+ end
371
+ end
@@ -0,0 +1,53 @@
1
+ module Slaw
2
+ module Render
3
+
4
+ # Support for transforming XML AN documents into HTML.
5
class HTMLRenderer
  # Load the +act+ and +fragment+ stylesheets from the xsl/ directory next
  # to this file.
  def initialize
    here = File.dirname(__FILE__)

    @xslt = {
      act: load_xslt(File.join(here, 'xsl/act.xsl')),
      fragment: load_xslt(File.join(here, 'xsl/fragment.xsl')),
    }
  end

  # Transform an entire XML document +doc+ (a Nokogiri::XML::Document object) into HTML.
  # Specify +base_url+ to manage the base for relative URLs generated by
  # the transform.
  def render(doc, base_url='')
    params = transform_params({'base_url' => base_url})
    run_xslt(:act, doc, params)
  end

  # Transform just a single node and its children into HTML.
  #
  # If +node+ has an id, we use xpath to tell the XSLT which
  # element to transform. Otherwise we copy the node into a new
  # tree and apply the XSLT to that.
  def render_node(node, base_url='')
    params = transform_params({'base_url' => base_url})

    if node.id
      # root_elem is appended after quoting, so the stylesheet receives it
      # as an XPath expression rather than a string literal
      params += ['root_elem', "//*[@id='#{node.id}']"]
      doc = node.document
    else
      # create a new document with just this element at the root
      doc = Nokogiri::XML::Document.new
      doc.root = node
      params += ['root_elem', '*']
    end

    run_xslt(:fragment, doc, params)
  end

  # Apply the stylesheet registered under +xslt+ (+:act+ or +:fragment+) to
  # +doc+ with +params+ and return the result as a string.
  def run_xslt(xslt, doc, params)
    @xslt[xslt].transform(doc, params).to_s
  end

  # Quote a hash of parameters for passing to an XSLT transform.
  def transform_params(params)
    Nokogiri::XSLT.quote_params(params)
  end

  private

  # Parse the stylesheet at +path+. The file is opened with a block so the
  # handle is closed as soon as parsing finishes. A File object (not its
  # contents) is handed to Nokogiri, as the original code did.
  # NOTE(review): presumably this is what lets relative xsl:import
  # references resolve against the stylesheet's own location - confirm.
  def load_xslt(path)
    File.open(path) { |file| Nokogiri::XSLT(file) }
  end
end
52
+ end
53
+ end
@@ -0,0 +1,15 @@
1
<?xml version="1.0"?>
<!-- Entry-point stylesheet producing HTML: from the document root it applies
     templates to the a:act element under a:akomaNtoso. The templates
     themselves are expected to come from the imported elements.xsl. -->
<xsl:stylesheet version="1.0"
                xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                xmlns:a="http://www.akomantoso.org/2.0"
                exclude-result-prefixes="a">

  <xsl:import href="elements.xsl"/>

  <xsl:output method="html"/>

  <xsl:template match="/">
    <xsl:apply-templates select="a:akomaNtoso/a:act"/>
  </xsl:template>

</xsl:stylesheet>
15
+