slaw 0.17.2 → 1.0.0.alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,307 @@
1
+ require 'slaw/grammars/core_nodes'
2
+
3
+ module Slaw
4
+ module Grammars
5
+ module ZA
6
+ module Act
7
class Act < Treetop::Runtime::SyntaxNode
  # Placeholder FRBR URIs written into the identification block; they are
  # stub values that let the generated document validate.
  FRBR_URI = '/za/act/1980/01'
  WORK_URI = FRBR_URI
  EXPRESSION_URI = "#{FRBR_URI}/eng@"
  MANIFESTATION_URI = EXPRESSION_URI

  # Render the whole act: an `act` element holding meta, preface, preamble
  # and body, followed by the schedules, which are written after the `act`
  # element has been closed.
  #
  # `b` is the XML builder. NOTE(review): presumably a Nokogiri::XML::Builder;
  # every block below keeps the arity of the original blocks in case the
  # builder's behaviour depends on it -- confirm before changing.
  # `idprefix` and `i` are accepted but not used here.
  def to_xml(b, idprefix=nil, i=0)
    b.act(contains: "originalVersion") do |act|
      write_meta(act)
      write_preface(act)
      write_preamble(act)
      write_body(act)
    end
    write_schedules(b)
  end

  # Write the `meta` element: identification plus the two organisation
  # references (slaw and council) that the identification block points at.
  def write_meta(b)
    b.meta do |meta|
      write_identification(meta)

      meta.references(source: "#this") {
        meta.TLCOrganization(id: 'slaw', href: 'https://github.com/longhotsummer/slaw', showAs: "Slaw")
        meta.TLCOrganization(id: 'council', href: '/ontology/organization/za/council', showAs: "Council")
      }
    end
  end

  # Write the FRBR work / expression / manifestation identification using
  # stub values so that a validating document can be generated.
  def write_identification(b)
    b.identification(source: "#slaw") do |ident|
      ident.FRBRWork do |work|
        work.FRBRthis(value: "#{WORK_URI}/main")
        work.FRBRuri(value: WORK_URI)
        work.FRBRalias(value: 'Short Title')
        work.FRBRdate(date: '1980-01-01', name: 'Generation')
        work.FRBRauthor(href: '#council')
        work.FRBRcountry(value: 'za')
      end

      ident.FRBRExpression do |expression|
        expression.FRBRthis(value: "#{EXPRESSION_URI}/main")
        expression.FRBRuri(value: EXPRESSION_URI)
        expression.FRBRdate(date: '1980-01-01', name: 'Generation')
        expression.FRBRauthor(href: '#council')
        expression.FRBRlanguage(language: 'eng')
      end

      ident.FRBRManifestation do |manifestation|
        manifestation.FRBRthis(value: "#{MANIFESTATION_URI}/main")
        manifestation.FRBRuri(value: MANIFESTATION_URI)
        # the manifestation is dated with the day the XML is generated
        manifestation.FRBRdate(date: Time.now.strftime('%Y-%m-%d'), name: 'Generation')
        manifestation.FRBRauthor(href: '#slaw')
      end
    end
  end

  # Write the preface, but only when the parsed node knows how to render itself.
  def write_preface(b)
    return unless preface.respond_to?(:to_xml)

    preface.to_xml(b)
  end

  # Write the preamble, but only when the parsed node knows how to render itself.
  def write_preamble(b)
    return unless preamble.respond_to?(:to_xml)

    preamble.to_xml(b)
  end

  # Write the body of the act.
  def write_body(b)
    body.to_xml(b)
  end

  # Write the schedules unless the schedules node matched no text.
  def write_schedules(b)
    schedules.to_xml(b) unless schedules.text_value == ""
  end
end
79
+
80
class Preface < Treetop::Runtime::SyntaxNode
  # Render a `preface` element containing every naked statement found one
  # level below each entry of `statements`. Nothing is written (and nil is
  # returned) when this node matched no text. Extra arguments are ignored.
  def to_xml(b, *args)
    return if text_value == ""

    b.preface do |preface|
      statements.elements.each do |group|
        group.elements.each do |node|
          # only naked statements are rendered; other matched nodes are skipped
          next unless node.is_a?(Slaw::Grammars::Inlines::NakedStatement)

          node.to_xml(preface, "")
        end
      end
    end
  end
end
93
+
94
class Preamble < Treetop::Runtime::SyntaxNode
  # Render a `preamble` element with one child per statement, each rendered
  # with an empty id prefix. Nothing is written (and nil is returned) when
  # this node matched no text. Extra arguments are ignored.
  def to_xml(b, *args)
    return if text_value == ""

    b.preamble do |preamble|
      statements.elements.each do |statement|
        statement.to_xml(preamble, "")
      end
    end
  end
end
105
+
106
class Part < Treetop::Runtime::SyntaxNode
  # The part's number, as reported by its heading.
  def num
    heading.num
  end

  # Render a `part` element: its heading, then every child with an id prefix
  # derived from this part's id and the child's index. Extra arguments are
  # ignored.
  def to_xml(b, *args)
    id = "part-#{num}"

    # include a chapter number in the id if the node two levels up is a
    # numbered chapter
    container = parent ? parent.parent : nil
    id = "chapter-#{container.num}.#{id}" if container.is_a?(Chapter) && container.num

    child_prefix = "#{id}."
    b.part(id: id) do |part|
      heading.to_xml(part)
      children.elements.each_with_index do |child, index|
        child.to_xml(part, child_prefix, index)
      end
    end
  end
end
125
+
126
class PartHeading < Treetop::Runtime::SyntaxNode
  # The part number: the alphanumerics matched by the heading prefix.
  def num
    part_heading_prefix.alphanums.text_value
  end

  # The stripped heading text, or nil when the heading node has no `content`.
  def title
    return unless heading.text_value && heading.respond_to?(:content)

    heading.content.text_value.strip
  end

  # Write the `num` element and, when a title is present, the `heading` element.
  def to_xml(b)
    b.num(num)

    text = title
    b.heading(text) if text
  end
end
142
+
143
class Chapter < Treetop::Runtime::SyntaxNode
  # The chapter's number, as reported by its heading.
  def num
    heading.num
  end

  # Render a `chapter` element: its heading, then every child with an id
  # prefix derived from this chapter's id and the child's index. Extra
  # arguments are ignored.
  def to_xml(b, *args)
    id = "chapter-#{num}"

    # include a part number in the id if the node two levels up is a
    # numbered part
    container = parent ? parent.parent : nil
    id = "part-#{container.num}.#{id}" if container.is_a?(Part) && container.num

    child_prefix = "#{id}."
    b.chapter(id: id) do |chapter|
      heading.to_xml(chapter)
      children.elements.each_with_index do |child, index|
        child.to_xml(chapter, child_prefix, index)
      end
    end
  end
end
162
+
163
class ChapterHeading < Treetop::Runtime::SyntaxNode
  # The chapter number: the alphanumerics matched by the heading prefix.
  def num
    chapter_heading_prefix.alphanums.text_value
  end

  # The stripped heading text, or nil when the heading node has no `content`.
  def title
    return unless heading.text_value && heading.respond_to?(:content)

    heading.content.text_value.strip
  end

  # Write the `num` element and, when a title is present, the `heading` element.
  def to_xml(b)
    b.num(num)

    text = title
    b.heading(text) if text
  end
end
179
+
180
class Section < Treetop::Runtime::SyntaxNode
  # The section number, taken from the section title node.
  def num
    section_title.num
  end

  # The section heading text, taken from the section title node.
  def title
    section_title.title
  end

  # Render a `section` element with id "section-<num>": the number followed
  # by a full stop, the heading, then every child with the section id as its
  # id prefix. Extra arguments are ignored.
  def to_xml(b, *args)
    id = "section-#{num}"
    child_prefix = "#{id}."

    b.section(id: id) do |section|
      section.num("#{num}.")
      section.heading(title)

      children.elements.each_with_index do |child, index|
        child.to_xml(section, child_prefix, index)
      end
    end
  end
end
201
+
202
class SectionTitleType1 < Treetop::Runtime::SyntaxNode
  # A section title where the heading comes first and the number sits on
  # the following line, in front of the section text:
  #
  #   Definitions
  #   1. In this act...

  # The section number matched by the title prefix.
  def num
    prefix = section_title_prefix
    prefix.number_letter.text_value
  end

  # The heading text, exactly as matched.
  def title
    content.text_value
  end
end
216
+
217
class SectionTitleType2 < Treetop::Runtime::SyntaxNode
  # A section title where the number and heading share a line and the
  # section text follows:
  #
  #   1. Definitions
  #   In this act...
  #
  # In this format the title is optional, and the section content may
  # start where we think the title is.

  # The section number matched by the title prefix.
  def num
    prefix = section_title_prefix
    prefix.number_letter.text_value
  end

  # The heading text, or an empty string when no title was matched.
  def title
    return "" if section_title.empty?

    section_title.content.text_value
  end
end
234
+
235
class BlockParagraph < Treetop::Runtime::SyntaxNode
  # Render a `paragraph` element wrapping a `content` element that holds
  # every child of this node.
  #
  # NOTE(review): the id always ends in "paragraph-0"; the index `i` is
  # accepted but not used when building it -- confirm this is intended.
  def to_xml(b, idprefix='', i=0)
    id = "#{idprefix}paragraph-0"
    child_prefix = "#{id}."

    b.paragraph(id: id) do |paragraph|
      paragraph.content do |inner|
        elements.each_with_index do |child, index|
          child.to_xml(inner, child_prefix, index)
        end
      end
    end
  end
end
247
+
248
class Subsection < Treetop::Runtime::SyntaxNode
  # The subsection number exactly as matched, e.g. with its parentheses.
  def num
    subsection_prefix.num.text_value
  end

  # Render a `subsection` element. Its id is the prefix plus the number with
  # parentheses removed; the children (with a non-empty `first_child` placed
  # first) go inside a `content` element.
  def to_xml(b, idprefix, i)
    id = idprefix + num.delete('()')
    child_prefix = id + "."

    kids = children.elements
    lead = first_child
    kids = [lead] + kids if lead && !lead.empty?

    b.subsection(id: id) do |subsection|
      subsection.num(num)
      subsection.content do |inner|
        if kids.empty?
          # schema requires a non-empty content element
          inner.p
        else
          kids.each_with_index do |child, index|
            child.to_xml(inner, child_prefix, index)
          end
        end
      end
    end
  end
end
273
+
274
class Blocklist < Treetop::Runtime::SyntaxNode
  # Render a `blockList` element with id "<idprefix>list<i>" and one entry
  # per child element. When a block is given, a `listIntroduction` element
  # is written first and the builder is yielded to the caller's block so it
  # can fill the introduction in.
  def to_xml(b, idprefix, i=0, &block)
    id = idprefix + "list#{i}"
    item_prefix = id + '.'

    b.blockList(id: id) do |list|
      list.listIntroduction { |intro| yield intro } if block_given?

      elements.each do |item|
        item.to_xml(list, item_prefix)
      end
    end
  end
end
288
+
289
class BlocklistItem < Treetop::Runtime::SyntaxNode
  # The item number exactly as matched by the item prefix.
  def num
    blocklist_item_prefix.text_value
  end

  # Render an `item` element whose id is the prefix plus the number with
  # parentheses removed. The item's clauses, when the parse produced any,
  # are written inside a `p` element; otherwise the `p` is left empty.
  def to_xml(b, idprefix)
    item_id = idprefix + num.delete('()')

    b.item(id: item_id) do |item|
      item.num(num)
      item.p do |para|
        if respond_to?(:item_content) && item_content.respond_to?(:clauses)
          item_content.clauses.to_xml(para, idprefix)
        end
      end
    end
  end
end
303
+
304
+ end
305
+ end
306
+ end
307
+ end
@@ -23,11 +23,12 @@ module Slaw
23
23
  include Slaw::Namespace
24
24
  include Slaw::Logging
25
25
 
26
- @@parsers = {}
27
-
28
26
  # Additional hash of options to be provided to the parser when parsing.
29
27
  attr_accessor :parse_options
30
28
 
29
+ # The parser to use
30
+ attr_accessor :parser
31
+
31
32
  # Prefix to use when generating IDs for fragments
32
33
  attr_accessor :fragment_id_prefix
33
34
 
@@ -36,26 +37,10 @@ module Slaw
36
37
  # Specify the parser to use with `:parser`.
37
38
  #
38
39
  # @option opts [Treetop::Runtime::CompiledParser] :parser parser to use
39
- # @option opts [String] :grammar_file grammar filename to load a parser from
40
- # @option opts [String] :grammar_class name of the class that the grammar will generate
40
+ # @option opts [Hash] :parse_options options to pass to the parser
41
41
  def initialize(opts={})
42
- if opts[:parser]
43
- @parser = opts[:parser]
44
- elsif opts[:grammar_file] and opts[:grammar_class]
45
- if @@parsers[opts[:grammar_class]]
46
- # already compiled the grammar, just use it
47
- @parser = @@parsers[opts[:grammar_class]]
48
- else
49
- # load the grammar
50
- Treetop.load(opts[:grammar_file])
51
- cls = eval(opts[:grammar_class])
52
- @parser = cls.new
53
- end
54
- else
55
- raise ArgumentError.new("Specify either :parser or :grammar_file and :grammar_class")
56
- end
57
-
58
- @parse_options = {}
42
+ @parser = opts[:parser]
43
+ @parse_options = opts[:parse_optiosn] || {}
59
44
  end
60
45
 
61
46
  # Do all the work necessary to parse text into a well-formed XML document.
@@ -167,7 +152,6 @@ module Slaw
167
152
  # @return [Nokogiri::XML::Document] the updated document
168
153
  def postprocess(doc)
169
154
  normalise_headings(doc)
170
- find_short_title(doc)
171
155
  adjust_blocklists(doc)
172
156
 
173
157
  doc
@@ -189,186 +173,6 @@ module Slaw
189
173
  end
190
174
  end
191
175
 
192
- # Find the short title and add it as an FRBRalias element in the meta section
193
- #
194
- # @param doc [Nokogiri::XML::Document]
195
- def find_short_title(doc)
196
- logger.info("Finding short title")
197
-
198
- # Short title and commencement
199
- # 8. This Act shall be called the Legal Aid Amendment Act, 1996, and shall come
200
- # into operation on a date fixed by the President by proclamation in the Gazette.
201
-
202
- doc.xpath('//a:body//a:heading[contains(text(), "hort title")]', a: NS).each do |heading|
203
- section = heading.parent.at_xpath('a:subsection', a: NS)
204
- if section and section.text =~ /this act (is|shall be called) the (([a-zA-Z\(\)]\s*)+, \d\d\d\d)/i
205
- short_title = $2
206
-
207
- logger.info("+ Found title: #{short_title}")
208
-
209
- node = doc.at_xpath('//a:meta//a:FRBRalias', a: NS)
210
- node['value'] = short_title
211
- break
212
- end
213
- end
214
- end
215
-
216
- # Find definitions of terms and introduce them into the
217
- # meta section of the document.
218
- #
219
- # @param doc [Nokogiri::XML::Document]
220
- def link_definitions(doc)
221
- logger.info("Finding and linking definitions")
222
-
223
- terms = find_definitions(doc)
224
- add_terms_to_references(doc, terms)
225
- find_term_references(doc, terms)
226
- renumber_terms(doc)
227
- end
228
-
229
- # Find `def` elements in the document and return a Hash from
230
- # term ids to the text of each term
231
- #
232
- # @param doc [Nokogiri::XML::Document]
233
- #
234
- # @return [Hash{String, String}]
235
- def find_definitions(doc)
236
- guess_at_definitions(doc)
237
-
238
- terms = {}
239
- doc.xpath('//a:def', a: NS).each do |defn|
240
- # <p>"<def refersTo="#term-affected_land">affected land</def>" means land in respect of which an application has been lodged in terms of section 17(1);</p>
241
- if defn['refersTo']
242
- id = defn['refersTo'].sub(/^#/, '')
243
- term = defn.content
244
- terms[id] = term
245
-
246
- logger.info("+ Found definition for: #{term}")
247
- end
248
- end
249
-
250
- terms
251
- end
252
-
253
- # Find defined terms in the document.
254
- #
255
- # This looks for heading elements with the words 'definitions' or 'interpretation',
256
- # and then looks for phrases like
257
- #
258
- # "this word" means something...
259
- #
260
- # It identifies "this word" as a defined term and wraps it in a def tag with a refersTo
261
- # attribute referencing the term being defined. The surrounding block
262
- # structure is also has its refersTo attribute set to the term. This way, the term
263
- # is both marked as defined, and the container element with the full
264
- # definition of the term is identified.
265
- def guess_at_definitions(doc)
266
- doc.xpath('//a:section', a: NS).select do |section|
267
- # sections with headings like Definitions
268
- heading = section.at_xpath('a:heading', a: NS)
269
- heading && heading.content =~ /definition|interpretation/i
270
- end.each do |section|
271
- # find items like "foo" means blah...
272
-
273
- section.xpath('.//a:p|.//a:listIntroduction', a: NS).each do |container|
274
- # only if we don't already have a definition here
275
- next if container.at_xpath('a:def', a: NS)
276
-
277
- # get first text node
278
- text = container.children.first
279
- next if (not text or not text.text?)
280
-
281
- match = /^\s*["“”](.+?)["“”]/.match(text.text)
282
- if match
283
- term = match.captures[0]
284
- term_id = 'term-' + term.gsub(/[^a-zA-Z0-9_-]/, '_')
285
-
286
- # <p>"<def refersTo="#term-affected_land">affected land</def>" means land in respect of which an application has been lodged in terms of section 17(1);</p>
287
- refersTo = "##{term_id}"
288
- defn = doc.create_element('def', term, refersTo: refersTo)
289
- rest = match.post_match
290
-
291
- text.before(defn)
292
- defn.before(doc.create_text_node('"'))
293
- text.content = '"' + rest
294
-
295
- # adjust the container's refersTo attribute
296
- parent = find_up(container, ['item', 'point', 'blockList', 'list', 'paragraph', 'subsection', 'section', 'chapter', 'part'])
297
- parent['refersTo'] = refersTo
298
- end
299
- end
300
- end
301
- end
302
-
303
- def add_terms_to_references(doc, terms)
304
- refs = doc.at_xpath('//a:meta/a:references', a: NS)
305
- unless refs
306
- refs = doc.create_element('references', source: "#this")
307
- doc.at_xpath('//a:meta/a:identification', a: NS).after(refs)
308
- end
309
-
310
- # nuke all existing term reference elements
311
- refs.xpath('a:TLCTerm', a: NS).each { |el| el.remove }
312
-
313
- for id, term in terms
314
- # <TLCTerm id="term-applicant" href="/ontology/term/this.eng.applicant" showAs="Applicant"/>
315
- refs << doc.create_element('TLCTerm',
316
- id: id,
317
- href: "/ontology/term/this.eng.#{id.gsub(/^term-/, '')}",
318
- showAs: term)
319
- end
320
- end
321
-
322
- # Find and decorate references to terms in the document.
323
- # The +terms+ param is a hash from term_id to actual term.
324
- def find_term_references(doc, terms)
325
- logger.info("+ Finding references to terms")
326
-
327
- i = 0
328
-
329
- # sort terms by the length of the defined term, desc,
330
- # so that we don't find short terms inside longer
331
- # terms
332
- terms = terms.to_a.sort_by { |pair| -pair[1].size }
333
-
334
- # look for each term
335
- for term_id, term in terms
336
- doc.xpath('//a:body//text()', a: NS).each do |text|
337
- # replace all occurrences in this text node
338
-
339
- # unless we're already inside a def or term element
340
- next if (["def", "term"].include?(text.parent.name))
341
-
342
- # don't link to a term inside its own definition
343
- owner = find_up(text, 'subsection')
344
- next if owner and owner.at_xpath(".//a:def[@refersTo='##{term_id}']", a: NS)
345
-
346
- while posn = (text.content =~ /\b#{Regexp::escape(term)}\b/)
347
- # <p>A delegation under subsection (1) shall not prevent the <term refersTo="#term-Minister" id="trm357">Minister</term> from exercising the power himself or herself.</p>
348
- node = doc.create_element('term', term, refersTo: "##{term_id}", id: "trm#{i}")
349
-
350
- pre = (posn > 0) ? text.content[0..posn-1] : nil
351
- post = text.content[posn+term.length..-1]
352
-
353
- text.before(node)
354
- node.before(doc.create_text_node(pre)) if pre
355
- text.content = post
356
-
357
- i += 1
358
- end
359
- end
360
- end
361
- end
362
-
363
- # recalculate ids for <term> elements
364
- def renumber_terms(doc)
365
- logger.info("Renumbering terms")
366
-
367
- doc.xpath('//a:term', a: NS).each_with_index do |term, i|
368
- term['id'] = "trm#{i}"
369
- end
370
- end
371
-
372
176
  # Adjust blocklists:
373
177
  #
374
178
  # - nest them correctly