slaw 0.17.2 → 1.0.0.alpha.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,307 @@
1
+ require 'slaw/grammars/core_nodes'
2
+
3
+ module Slaw
4
+ module Grammars
5
+ module ZA
6
+ module Act
7
class Act < Treetop::Runtime::SyntaxNode
  # Placeholder FRBR URIs written into the identification block
  # (see #write_identification).
  FRBR_URI = '/za/act/1980/01'
  WORK_URI = FRBR_URI
  EXPRESSION_URI = "#{FRBR_URI}/eng@"
  MANIFESTATION_URI = EXPRESSION_URI

  # Write this act to the builder: an +act+ element holding the meta,
  # preface, preamble and body, followed by the schedules, which are
  # written after the +act+ element has been closed.
  #
  # +idprefix+ and +i+ are accepted but not used here.
  def to_xml(b, idprefix=nil, i=0)
    b.act(contains: "originalVersion") do |act|
      write_meta(act)
      write_preface(act)
      write_preamble(act)
      write_body(act)
    end

    write_schedules(b)
  end

  # Write the +meta+ element: the identification block followed by
  # the references to the Slaw and Council organisations.
  def write_meta(b)
    b.meta do |meta|
      write_identification(meta)

      # this block deliberately takes no parameter, as in the original
      meta.references(source: "#this") {
        meta.TLCOrganization(id: 'slaw', href: 'https://github.com/longhotsummer/slaw', showAs: "Slaw")
        meta.TLCOrganization(id: 'council', href: '/ontology/organization/za/council', showAs: "Council")
      }
    end
  end

  # Write the +identification+ element with its FRBRWork,
  # FRBRExpression and FRBRManifestation children.
  def write_identification(b)
    b.identification(source: "#slaw") do |ident|
      # use stub values so that we can generate a validating document
      ident.FRBRWork do |work|
        work.FRBRthis(value: "#{WORK_URI}/main")
        work.FRBRuri(value: WORK_URI)
        work.FRBRalias(value: 'Short Title')
        work.FRBRdate(date: '1980-01-01', name: 'Generation')
        work.FRBRauthor(href: '#council')
        work.FRBRcountry(value: 'za')
      end

      ident.FRBRExpression do |expression|
        expression.FRBRthis(value: "#{EXPRESSION_URI}/main")
        expression.FRBRuri(value: EXPRESSION_URI)
        expression.FRBRdate(date: '1980-01-01', name: 'Generation')
        expression.FRBRauthor(href: '#council')
        expression.FRBRlanguage(language: 'eng')
      end

      ident.FRBRManifestation do |manifestation|
        manifestation.FRBRthis(value: "#{MANIFESTATION_URI}/main")
        manifestation.FRBRuri(value: MANIFESTATION_URI)
        # the manifestation is dated with the day the XML is generated
        manifestation.FRBRdate(date: Time.now.strftime('%Y-%m-%d'), name: 'Generation')
        manifestation.FRBRauthor(href: '#slaw')
      end
    end
  end

  # Write the preface, but only when the preface node can render itself.
  def write_preface(b)
    return unless preface.respond_to?(:to_xml)

    preface.to_xml(b)
  end

  # Write the preamble, but only when the preamble node can render itself.
  def write_preamble(b)
    return unless preamble.respond_to?(:to_xml)

    preamble.to_xml(b)
  end

  # Write the body of the act.
  def write_body(b)
    body.to_xml(b)
  end

  # Write the schedules, skipping them when they matched no text.
  def write_schedules(b)
    schedules.to_xml(b) unless schedules.text_value == ""
  end
end
79
+
80
class Preface < Treetop::Runtime::SyntaxNode
  # Write a +preface+ element containing every naked statement found
  # among the children of each statement node. Nothing is written when
  # this node matched no text. Extra arguments are ignored.
  def to_xml(b, *args)
    return if text_value == ""

    b.preface do |preface|
      statements.elements.each do |statement|
        statement.elements.each do |child|
          # only naked statements are rendered; other children are skipped
          next unless child.is_a?(Slaw::Grammars::Inlines::NakedStatement)

          child.to_xml(preface, "")
        end
      end
    end
  end
end
93
+
94
class Preamble < Treetop::Runtime::SyntaxNode
  # Write a +preamble+ element containing each statement, rendered with
  # an empty id prefix. Nothing is written when this node matched no
  # text. Extra arguments are ignored.
  def to_xml(b, *args)
    return if text_value == ""

    b.preamble do |preamble|
      statements.elements.each do |statement|
        statement.to_xml(preamble, "")
      end
    end
  end
end
105
+
106
class Part < Treetop::Runtime::SyntaxNode
  # The part's number, as reported by its heading.
  def num
    heading.num
  end

  # Write a +part+ element holding the heading and then each child.
  # Children receive this part's id followed by a dot as their id
  # prefix, plus their index. Extra arguments are ignored.
  def to_xml(b, *args)
    id = "part-#{num}"

    # include a chapter number in the id if our parent has one
    container = parent && parent.parent
    if container.is_a?(Chapter) && container.num
      id = "chapter-#{container.num}.#{id}"
    end

    b.part(id: id) do |part|
      heading.to_xml(part)
      children.elements.each_with_index do |child, index|
        child.to_xml(part, "#{id}.", index)
      end
    end
  end
end
125
+
126
class PartHeading < Treetop::Runtime::SyntaxNode
  # The text of the alphanumeric portion of the heading prefix.
  def num
    part_heading_prefix.alphanums.text_value
  end

  # The stripped heading text, or nil when the heading node does not
  # expose +content+.
  def title
    return unless heading.text_value && heading.respond_to?(:content)

    heading.content.text_value.strip
  end

  # Write the +num+ element, and a +heading+ element when there is a title.
  def to_xml(b)
    b.num(num)

    text = title
    b.heading(text) if text
  end
end
142
+
143
class Chapter < Treetop::Runtime::SyntaxNode
  # The chapter's number, as reported by its heading.
  def num
    heading.num
  end

  # Write a +chapter+ element holding the heading and then each child.
  # Children receive this chapter's id followed by a dot as their id
  # prefix, plus their index. Extra arguments are ignored.
  def to_xml(b, *args)
    id = "chapter-#{num}"

    # include a part number in the id if our parent has one
    container = parent && parent.parent
    if container.is_a?(Part) && container.num
      id = "part-#{container.num}.#{id}"
    end

    b.chapter(id: id) do |chapter|
      heading.to_xml(chapter)
      children.elements.each_with_index do |child, index|
        child.to_xml(chapter, "#{id}.", index)
      end
    end
  end
end
162
+
163
class ChapterHeading < Treetop::Runtime::SyntaxNode
  # The text of the alphanumeric portion of the heading prefix.
  def num
    chapter_heading_prefix.alphanums.text_value
  end

  # The stripped heading text, or nil when the heading node does not
  # expose +content+.
  def title
    return unless heading.text_value && heading.respond_to?(:content)

    heading.content.text_value.strip
  end

  # Write the +num+ element, and a +heading+ element when there is a title.
  def to_xml(b)
    b.num(num)

    text = title
    b.heading(text) if text
  end
end
179
+
180
class Section < Treetop::Runtime::SyntaxNode
  # The section number, taken from the section title node.
  def num
    section_title.num
  end

  # The section title text, taken from the section title node.
  def title
    section_title.title
  end

  # Write a +section+ element: the number (with a trailing full stop),
  # the heading, and then each child. Children receive "section-N." as
  # their id prefix, plus their index. Extra arguments are ignored.
  def to_xml(b, *args)
    number = num
    id = "section-#{number}"

    b.section(id: id) do |section|
      section.num("#{number}.")
      section.heading(title)

      child_prefix = "#{id}."
      children.elements.each_with_index do |child, index|
        child.to_xml(section, child_prefix, index)
      end
    end
  end
end
201
+
202
class SectionTitleType1 < Treetop::Runtime::SyntaxNode
  # A section title where the heading comes before the number:
  #
  #   Definitions
  #   1. In this act...

  # The text of the number/letter part of the title prefix.
  def num
    prefix = section_title_prefix
    prefix.number_letter.text_value
  end

  # The text of the title content.
  def title
    content.text_value
  end
end
216
+
217
class SectionTitleType2 < Treetop::Runtime::SyntaxNode
  # A section title where the number comes before the heading:
  #
  #   1. Definitions
  #   In this act...
  #
  # The title is optional in this format, and the section content may
  # begin where we would expect to find the title.

  # The text of the number/letter part of the title prefix.
  def num
    prefix = section_title_prefix
    prefix.number_letter.text_value
  end

  # The title text, or an empty string when no title was matched.
  def title
    return "" if section_title.empty?

    section_title.content.text_value
  end
end
234
+
235
class BlockParagraph < Treetop::Runtime::SyntaxNode
  # Write a +paragraph+ element whose +content+ holds each child
  # element. Children receive this paragraph's id followed by a dot as
  # their id prefix, plus their index.
  #
  # NOTE(review): the id always ends in "paragraph-0"; the +i+ argument
  # is not used when building it. Confirm this is intended before
  # relying on these ids being unique.
  def to_xml(b, idprefix='', i=0)
    id = "#{idprefix}paragraph-0"
    child_prefix = "#{id}."

    b.paragraph(id: id) do |paragraph|
      paragraph.content do |content|
        elements.each_with_index do |child, index|
          child.to_xml(content, child_prefix, index)
        end
      end
    end
  end
end
247
+
248
class Subsection < Treetop::Runtime::SyntaxNode
  # The text of the number in the subsection prefix.
  def num
    subsection_prefix.num.text_value
  end

  # Write a +subsection+ element: its number, then a +content+ element
  # holding the first child (when present and non-empty) followed by
  # the remaining children. The id is the given prefix plus the number
  # with its parentheses removed.
  def to_xml(b, idprefix, i)
    id = idprefix + num.gsub(/[()]/, '')
    child_prefix = id + "."

    kids = children.elements
    leading = first_child
    kids = [leading] + kids if leading && !leading.empty?

    b.subsection(id: id) do |subsection|
      subsection.num(num)
      subsection.content do |content|
        if kids.empty?
          # schema requires a non-empty content element
          content.p
        else
          kids.each_with_index do |kid, index|
            kid.to_xml(content, child_prefix, index)
          end
        end
      end
    end
  end
end
273
+
274
class Blocklist < Treetop::Runtime::SyntaxNode
  # Write a +blockList+ element containing each child element.
  #
  # When a block is given, a +listIntroduction+ element is written
  # first and the block is called with the builder so that it can fill
  # in the introduction. The list id is the given prefix plus "list"
  # and the index +i+; children receive that id followed by a dot.
  def to_xml(b, idprefix, i=0, &block)
    id = idprefix + "list#{i}"
    item_prefix = id + '.'

    b.blockList(id: id) do |list|
      if block
        list.listIntroduction { |intro| block.call(intro) }
      end

      elements.each do |item|
        item.to_xml(list, item_prefix)
      end
    end
  end
end
288
+
289
class BlocklistItem < Treetop::Runtime::SyntaxNode
  # The full text of the item prefix, including any parentheses.
  def num
    blocklist_item_prefix.text_value
  end

  # Write an +item+ element: its number and a +p+ element holding the
  # item's clauses, when it has any. The id is the given prefix plus
  # the number with its parentheses removed.
  def to_xml(b, idprefix)
    item_id = idprefix + num.gsub(/[()]/, '')

    b.item(id: item_id) do |item|
      item.num(num)
      item.p do |para|
        # the +p+ is left empty when this node has no item content,
        # or when the item content has no clauses
        if respond_to?(:item_content) && item_content.respond_to?(:clauses)
          item_content.clauses.to_xml(para, idprefix)
        end
      end
    end
  end
end
303
+
304
+ end
305
+ end
306
+ end
307
+ end
@@ -23,11 +23,12 @@ module Slaw
23
23
  include Slaw::Namespace
24
24
  include Slaw::Logging
25
25
 
26
- @@parsers = {}
27
-
28
26
  # Additional hash of options to be provided to the parser when parsing.
29
27
  attr_accessor :parse_options
30
28
 
29
+ # The parser to use
30
+ attr_accessor :parser
31
+
31
32
  # Prefix to use when generating IDs for fragments
32
33
  attr_accessor :fragment_id_prefix
33
34
 
@@ -36,26 +37,10 @@ module Slaw
36
37
  # Specify the parser to use with `:parser`.
37
38
  #
38
39
  # @option opts [Treetop::Runtime::CompiledParser] :parser parser to use
39
- # @option opts [String] :grammar_file grammar filename to load a parser from
40
- # @option opts [String] :grammar_class name of the class that the grammar will generate
40
+ # @option opts [Hash] :parse_options options to pass to the parser
41
41
  def initialize(opts={})
42
- if opts[:parser]
43
- @parser = opts[:parser]
44
- elsif opts[:grammar_file] and opts[:grammar_class]
45
- if @@parsers[opts[:grammar_class]]
46
- # already compiled the grammar, just use it
47
- @parser = @@parsers[opts[:grammar_class]]
48
- else
49
- # load the grammar
50
- Treetop.load(opts[:grammar_file])
51
- cls = eval(opts[:grammar_class])
52
- @parser = cls.new
53
- end
54
- else
55
- raise ArgumentError.new("Specify either :parser or :grammar_file and :grammar_class")
56
- end
57
-
58
- @parse_options = {}
42
+ @parser = opts[:parser]
43
+ @parse_options = opts[:parse_optiosn] || {}
59
44
  end
60
45
 
61
46
  # Do all the work necessary to parse text into a well-formed XML document.
@@ -167,7 +152,6 @@ module Slaw
167
152
  # @return [Nokogiri::XML::Document] the updated document
168
153
  def postprocess(doc)
169
154
  normalise_headings(doc)
170
- find_short_title(doc)
171
155
  adjust_blocklists(doc)
172
156
 
173
157
  doc
@@ -189,186 +173,6 @@ module Slaw
189
173
  end
190
174
  end
191
175
 
192
- # Find the short title and add it as an FRBRalias element in the meta section
193
- #
194
- # @param doc [Nokogiri::XML::Document]
195
- def find_short_title(doc)
196
- logger.info("Finding short title")
197
-
198
- # Short title and commencement
199
- # 8. This Act shall be called the Legal Aid Amendment Act, 1996, and shall come
200
- # into operation on a date fixed by the President by proclamation in the Gazette.
201
-
202
- doc.xpath('//a:body//a:heading[contains(text(), "hort title")]', a: NS).each do |heading|
203
- section = heading.parent.at_xpath('a:subsection', a: NS)
204
- if section and section.text =~ /this act (is|shall be called) the (([a-zA-Z\(\)]\s*)+, \d\d\d\d)/i
205
- short_title = $2
206
-
207
- logger.info("+ Found title: #{short_title}")
208
-
209
- node = doc.at_xpath('//a:meta//a:FRBRalias', a: NS)
210
- node['value'] = short_title
211
- break
212
- end
213
- end
214
- end
215
-
216
- # Find definitions of terms and introduce them into the
217
- # meta section of the document.
218
- #
219
- # @param doc [Nokogiri::XML::Document]
220
- def link_definitions(doc)
221
- logger.info("Finding and linking definitions")
222
-
223
- terms = find_definitions(doc)
224
- add_terms_to_references(doc, terms)
225
- find_term_references(doc, terms)
226
- renumber_terms(doc)
227
- end
228
-
229
- # Find `def` elements in the document and return a Hash from
230
- # term ids to the text of each term
231
- #
232
- # @param doc [Nokogiri::XML::Document]
233
- #
234
- # @return [Hash{String, String}]
235
- def find_definitions(doc)
236
- guess_at_definitions(doc)
237
-
238
- terms = {}
239
- doc.xpath('//a:def', a: NS).each do |defn|
240
- # <p>"<def refersTo="#term-affected_land">affected land</def>" means land in respect of which an application has been lodged in terms of section 17(1);</p>
241
- if defn['refersTo']
242
- id = defn['refersTo'].sub(/^#/, '')
243
- term = defn.content
244
- terms[id] = term
245
-
246
- logger.info("+ Found definition for: #{term}")
247
- end
248
- end
249
-
250
- terms
251
- end
252
-
253
- # Find defined terms in the document.
254
- #
255
- # This looks for heading elements with the words 'definitions' or 'interpretation',
256
- # and then looks for phrases like
257
- #
258
- # "this word" means something...
259
- #
260
- # It identifies "this word" as a defined term and wraps it in a def tag with a refersTo
261
- # attribute referencing the term being defined. The surrounding block
262
- # structure is also has its refersTo attribute set to the term. This way, the term
263
- # is both marked as defined, and the container element with the full
264
- # definition of the term is identified.
265
- def guess_at_definitions(doc)
266
- doc.xpath('//a:section', a: NS).select do |section|
267
- # sections with headings like Definitions
268
- heading = section.at_xpath('a:heading', a: NS)
269
- heading && heading.content =~ /definition|interpretation/i
270
- end.each do |section|
271
- # find items like "foo" means blah...
272
-
273
- section.xpath('.//a:p|.//a:listIntroduction', a: NS).each do |container|
274
- # only if we don't already have a definition here
275
- next if container.at_xpath('a:def', a: NS)
276
-
277
- # get first text node
278
- text = container.children.first
279
- next if (not text or not text.text?)
280
-
281
- match = /^\s*["“”](.+?)["“”]/.match(text.text)
282
- if match
283
- term = match.captures[0]
284
- term_id = 'term-' + term.gsub(/[^a-zA-Z0-9_-]/, '_')
285
-
286
- # <p>"<def refersTo="#term-affected_land">affected land</def>" means land in respect of which an application has been lodged in terms of section 17(1);</p>
287
- refersTo = "##{term_id}"
288
- defn = doc.create_element('def', term, refersTo: refersTo)
289
- rest = match.post_match
290
-
291
- text.before(defn)
292
- defn.before(doc.create_text_node('"'))
293
- text.content = '"' + rest
294
-
295
- # adjust the container's refersTo attribute
296
- parent = find_up(container, ['item', 'point', 'blockList', 'list', 'paragraph', 'subsection', 'section', 'chapter', 'part'])
297
- parent['refersTo'] = refersTo
298
- end
299
- end
300
- end
301
- end
302
-
303
- def add_terms_to_references(doc, terms)
304
- refs = doc.at_xpath('//a:meta/a:references', a: NS)
305
- unless refs
306
- refs = doc.create_element('references', source: "#this")
307
- doc.at_xpath('//a:meta/a:identification', a: NS).after(refs)
308
- end
309
-
310
- # nuke all existing term reference elements
311
- refs.xpath('a:TLCTerm', a: NS).each { |el| el.remove }
312
-
313
- for id, term in terms
314
- # <TLCTerm id="term-applicant" href="/ontology/term/this.eng.applicant" showAs="Applicant"/>
315
- refs << doc.create_element('TLCTerm',
316
- id: id,
317
- href: "/ontology/term/this.eng.#{id.gsub(/^term-/, '')}",
318
- showAs: term)
319
- end
320
- end
321
-
322
- # Find and decorate references to terms in the document.
323
- # The +terms+ param is a hash from term_id to actual term.
324
- def find_term_references(doc, terms)
325
- logger.info("+ Finding references to terms")
326
-
327
- i = 0
328
-
329
- # sort terms by the length of the defined term, desc,
330
- # so that we don't find short terms inside longer
331
- # terms
332
- terms = terms.to_a.sort_by { |pair| -pair[1].size }
333
-
334
- # look for each term
335
- for term_id, term in terms
336
- doc.xpath('//a:body//text()', a: NS).each do |text|
337
- # replace all occurrences in this text node
338
-
339
- # unless we're already inside a def or term element
340
- next if (["def", "term"].include?(text.parent.name))
341
-
342
- # don't link to a term inside its own definition
343
- owner = find_up(text, 'subsection')
344
- next if owner and owner.at_xpath(".//a:def[@refersTo='##{term_id}']", a: NS)
345
-
346
- while posn = (text.content =~ /\b#{Regexp::escape(term)}\b/)
347
- # <p>A delegation under subsection (1) shall not prevent the <term refersTo="#term-Minister" id="trm357">Minister</term> from exercising the power himself or herself.</p>
348
- node = doc.create_element('term', term, refersTo: "##{term_id}", id: "trm#{i}")
349
-
350
- pre = (posn > 0) ? text.content[0..posn-1] : nil
351
- post = text.content[posn+term.length..-1]
352
-
353
- text.before(node)
354
- node.before(doc.create_text_node(pre)) if pre
355
- text.content = post
356
-
357
- i += 1
358
- end
359
- end
360
- end
361
- end
362
-
363
- # recalculate ids for <term> elements
364
- def renumber_terms(doc)
365
- logger.info("Renumbering terms")
366
-
367
- doc.xpath('//a:term', a: NS).each_with_index do |term, i|
368
- term['id'] = "trm#{i}"
369
- end
370
- end
371
-
372
176
  # Adjust blocklists:
373
177
  #
374
178
  # - nest them correctly