RubyGems - slaw - Versions diffs - 0.17.2 → 1.0.0.alpha.1 - Mend

slaw 0.17.2 → 1.0.0.alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

checksums.yaml +4 -4
data/README.md +9 -2
data/bin/slaw +2 -19
data/lib/slaw/generator.rb +21 -6
data/lib/slaw/grammars/core_nodes.rb +17 -0
data/lib/slaw/grammars/inlines.treetop +45 -0
data/lib/slaw/grammars/inlines_nodes.rb +58 -0
data/lib/slaw/grammars/pl/act.treetop +246 -0
data/lib/slaw/grammars/pl/act_nodes.rb +469 -0
data/lib/slaw/grammars/schedules.treetop +33 -0
data/lib/slaw/grammars/schedules_nodes.rb +107 -0
data/lib/slaw/grammars/tables.treetop +59 -0
data/lib/slaw/grammars/tables_nodes.rb +74 -0
data/lib/slaw/grammars/terminals.treetop +84 -0
data/lib/slaw/grammars/za/act.treetop +222 -0
data/lib/slaw/grammars/za/act_nodes.rb +307 -0
data/lib/slaw/{za → grammars/za}/act_text.xsl +0 -0
data/lib/slaw/parse/builder.rb +6 -202
data/lib/slaw/version.rb +1 -1
data/spec/generator_spec.rb +2 -0
data/spec/parse/builder_spec.rb +0 -48
data/spec/pl/act_block_spec.rb +449 -0
data/spec/za/act_block_spec.rb +5 -3
data/spec/za/act_inline_spec.rb +2 -0
data/spec/za/act_schedules_spec.rb +2 -0
data/spec/za/act_table_spec.rb +2 -0
metadata +19 -7
data/lib/slaw/za/act.treetop +0 -393
data/lib/slaw/za/act_nodes.rb +0 -532

data/lib/slaw/grammars/za/act_nodes.rb ADDED

@@ -0,0 +1,307 @@
+require 'slaw/grammars/core_nodes'
+module Slaw
+  module Grammars
+    module ZA
+      module Act
+        class Act < Treetop::Runtime::SyntaxNode
+          FRBR_URI = '/za/act/1980/01'
+          WORK_URI = FRBR_URI
+          EXPRESSION_URI = "#{FRBR_URI}/eng@"
+          MANIFESTATION_URI = EXPRESSION_URI
+          def to_xml(b, idprefix=nil, i=0)
+            b.act(contains: "originalVersion") { |b|
+              write_meta(b)
+              write_preface(b)
+              write_preamble(b)
+              write_body(b)
+            }
+            write_schedules(b)
+          end
+          def write_meta(b)
+            b.meta { |b|
+              write_identification(b)
+              b.references(source: "#this") {
+                b.TLCOrganization(id: 'slaw', href: 'https://github.com/longhotsummer/slaw', showAs: "Slaw")
+                b.TLCOrganization(id: 'council', href: '/ontology/organization/za/council', showAs: "Council")
+              }
+            }
+          end
+          def write_identification(b)
+            b.identification(source: "#slaw") { |b|
+              # use stub values so that we can generate a validating document
+              b.FRBRWork { |b|
+                b.FRBRthis(value: "#{WORK_URI}/main")
+                b.FRBRuri(value: WORK_URI)
+                b.FRBRalias(value: 'Short Title')
+                b.FRBRdate(date: '1980-01-01', name: 'Generation')
+                b.FRBRauthor(href: '#council')
+                b.FRBRcountry(value: 'za')
+              }
+              b.FRBRExpression { |b|
+                b.FRBRthis(value: "#{EXPRESSION_URI}/main")
+                b.FRBRuri(value: EXPRESSION_URI)
+                b.FRBRdate(date: '1980-01-01', name: 'Generation')
+                b.FRBRauthor(href: '#council')
+                b.FRBRlanguage(language: 'eng')
+              }
+              b.FRBRManifestation { |b|
+                b.FRBRthis(value: "#{MANIFESTATION_URI}/main")
+                b.FRBRuri(value: MANIFESTATION_URI)
+                b.FRBRdate(date: Time.now.strftime('%Y-%m-%d'), name: 'Generation')
+                b.FRBRauthor(href: '#slaw')
+              }
+            }
+          end
+          def write_preface(b)
+            preface.to_xml(b) if preface.respond_to? :to_xml
+          end
+          def write_preamble(b)
+            preamble.to_xml(b) if preamble.respond_to? :to_xml
+          end
+          def write_body(b)
+            body.to_xml(b)
+          end
+          def write_schedules(b)
+            if schedules.text_value != ""
+              schedules.to_xml(b)
+            end
+          end
+        end
+        class Preface < Treetop::Runtime::SyntaxNode
+          def to_xml(b, *args)
+            if text_value != ""
+              b.preface { |b|
+                statements.elements.each { |element|
+                  for e in element.elements
+                    e.to_xml(b, "") if e.is_a? Slaw::Grammars::Inlines::NakedStatement
+                  end
+                }
+              }
+            end
+          end
+        end
+        class Preamble < Treetop::Runtime::SyntaxNode
+          def to_xml(b, *args)
+            if text_value != ""
+              b.preamble { |b|
+                statements.elements.each { |e|
+                  e.to_xml(b, "")
+                }
+              }
+            end
+          end
+        end
+        class Part < Treetop::Runtime::SyntaxNode
+          def num
+            heading.num
+          end
+          def to_xml(b, *args)
+            id = "part-#{num}"
+            # include a chapter number in the id if our parent has one
+            if parent and parent.parent.is_a?(Chapter) and parent.parent.num
+              id = "chapter-#{parent.parent.num}.#{id}"
+            end
+            b.part(id: id) { |b|
+              heading.to_xml(b)
+              children.elements.each_with_index { |e, i| e.to_xml(b, id + '.', i) }
+            }
+          end
+        end
+        class PartHeading < Treetop::Runtime::SyntaxNode
+          def num
+            part_heading_prefix.alphanums.text_value
+          end
+          def title
+            if heading.text_value and heading.respond_to? :content
+              heading.content.text_value.strip
+            end
+          end
+          def to_xml(b)
+            b.num(num)
+            b.heading(title) if title
+          end
+        end
+        class Chapter < Treetop::Runtime::SyntaxNode
+          def num
+            heading.num
+          end
+          def to_xml(b, *args)
+            id = "chapter-#{num}"
+            # include a part number in the id if our parent has one
+            if parent and parent.parent.is_a?(Part) and parent.parent.num
+              id = "part-#{parent.parent.num}.#{id}"
+            end
+            b.chapter(id: id) { |b|
+              heading.to_xml(b)
+              children.elements.each_with_index { |e, i| e.to_xml(b, id + '.', i) }
+            }
+          end
+        end
+        class ChapterHeading < Treetop::Runtime::SyntaxNode
+          def num
+            chapter_heading_prefix.alphanums.text_value
+          end
+          def title
+            if heading.text_value and heading.respond_to? :content
+              heading.content.text_value.strip
+            end
+          end
+          def to_xml(b)
+            b.num(num)
+            b.heading(title) if title
+          end
+        end
+        class Section < Treetop::Runtime::SyntaxNode
+          def num
+            section_title.num
+          end
+          def title
+            section_title.title
+          end
+          def to_xml(b, *args)
+            id = "section-#{num}"
+            b.section(id: id) { |b|
+              b.num("#{num}.")
+              b.heading(title)
+              idprefix = "#{id}."
+              children.elements.each_with_index { |e, i| e.to_xml(b, idprefix, i) }
+            }
+          end
+        end
+        class SectionTitleType1 < Treetop::Runtime::SyntaxNode
+          # a section title of the form:
+          #
+          # Definitions
+          # 1. In this act...
+          def num
+            section_title_prefix.number_letter.text_value
+          end
+          def title
+            content.text_value
+          end
+        end
+        class SectionTitleType2 < Treetop::Runtime::SyntaxNode
+          # a section title of the form:
+          #
+          # 1. Definitions
+          # In this act...
+          #
+          # In this format, the title is optional and the section content may
+          # start where we think the title is.
+          def num
+            section_title_prefix.number_letter.text_value
+          end
+          def title
+            section_title.empty? ? "" : section_title.content.text_value
+          end
+        end
+        class BlockParagraph < Treetop::Runtime::SyntaxNode
+          def to_xml(b, idprefix='', i=0)
+            id = "#{idprefix}paragraph-0"
+            idprefix = "#{id}."
+            b.paragraph(id: id) { |b|
+              b.content { |b|
+                elements.each_with_index { |e, i| e.to_xml(b, idprefix, i) }
+              }
+            }
+          end
+        end
+        class Subsection < Treetop::Runtime::SyntaxNode
+          def num
+            subsection_prefix.num.text_value
+          end
+          def to_xml(b, idprefix, i)
+            id = idprefix + num.gsub(/[()]/, '')
+            idprefix = id + "."
+            kids = children.elements
+            kids = [first_child] + kids if first_child and !first_child.empty?
+            b.subsection(id: id) { |b|
+              b.num(num)
+              b.content { |b|
+                if kids.empty?
+                  # schema requires a non-empty content element
+                  b.p
+                else
+                  kids.each_with_index { |e, i| e.to_xml(b, idprefix, i) }
+                end
+              }
+            }
+          end
+        end
+        class Blocklist < Treetop::Runtime::SyntaxNode
+          # Render a block list to xml. If a block is given,
+          # yield to it a builder to insert a listIntroduction node
+          def to_xml(b, idprefix, i=0, &block)
+            id = idprefix + "list#{i}"
+            idprefix = id + '.'
+            b.blockList(id: id) { |b|
+              b.listIntroduction { |b| yield b } if block_given?
+              elements.each { |e| e.to_xml(b, idprefix) }
+            }
+          end
+        end
+        class BlocklistItem < Treetop::Runtime::SyntaxNode
+          def num
+            blocklist_item_prefix.text_value
+          end
+          def to_xml(b, idprefix)
+            b.item(id: idprefix + num.gsub(/[()]/, '')) { |b|
+              b.num(num)
+              b.p { |b|
+                item_content.clauses.to_xml(b, idprefix) if respond_to? :item_content and item_content.respond_to? :clauses
+              }
+            }
+          end
+        end
+      end
+    end
+  end
+end

data/lib/slaw/{za → grammars/za}/act_text.xsl RENAMED

File without changes

data/lib/slaw/parse/builder.rb CHANGED

@@ -23,11 +23,12 @@ module Slaw
       include Slaw::Namespace
       include Slaw::Logging
-      @@parsers = {}
       # Additional hash of options to be provided to the parser when parsing.
       attr_accessor :parse_options
+      # The parser to use
+      attr_accessor :parser
       # Prefix to use when generating IDs for fragments
       attr_accessor :fragment_id_prefix
@@ -36,26 +37,10 @@ module Slaw
       # Specify either `:parser` or `:grammar_file` and `:grammar_class`.
       #
       # @option opts [Treetop::Runtime::CompiledParser] :parser parser to use
-      # @option opts [String] :grammar_file grammar filename to load a parser from
-      # @option opts [String] :grammar_class name of the class that the grammar will generate
+      # @option opts [Hash] :parse_options options to pass to the parser
       def initialize(opts={})
-        if opts[:parser]
-          @parser = opts[:parser]
-        elsif opts[:grammar_file] and opts[:grammar_class]
-          if @@parsers[opts[:grammar_class]]
-            # already compiled the grammar, just use it
-            @parser = @@parsers[opts[:grammar_class]]
-          else
-            # load the grammar
-            Treetop.load(opts[:grammar_file])
-            cls = eval(opts[:grammar_class])
-            @parser = cls.new
-          end
-        else
-          raise ArgumentError.new("Specify either :parser or :grammar_file and :grammar_class")
-        end
-        @parse_options = {}
+        @parser = opts[:parser]
+        @parse_options = opts[:parse_optiosn] || {}
       end
       # Do all the work necessary to parse text into a well-formed XML document.
@@ -167,7 +152,6 @@ module Slaw
       # @return [Nokogiri::XML::Document] the updated document
       def postprocess(doc)
         normalise_headings(doc)
-        find_short_title(doc)
         adjust_blocklists(doc)
         doc
@@ -189,186 +173,6 @@ module Slaw
         end
       end
-      # Find the short title and add it as an FRBRalias element in the meta section
-      #
-      # @param doc [Nokogiri::XML::Document]
-      def find_short_title(doc)
-        logger.info("Finding short title")
-        # Short title and commencement
-        # 8. This Act shall be called the Legal Aid Amendment Act, 1996, and shall come
-        # into operation on a date fixed by the President by proclamation in the Gazette.
-        doc.xpath('//a:body//a:heading[contains(text(), "hort title")]', a: NS).each do |heading|
-          section = heading.parent.at_xpath('a:subsection', a: NS)
-          if section and section.text =~ /this act (is|shall be called) the (([a-zA-Z\(\)]\s*)+, \d\d\d\d)/i
-            short_title = $2
-            logger.info("+ Found title: #{short_title}")
-            node = doc.at_xpath('//a:meta//a:FRBRalias', a: NS)
-            node['value'] = short_title
-            break
-          end
-        end
-      end
-      # Find definitions of terms and introduce them into the
-      # meta section of the document.
-      #
-      # @param doc [Nokogiri::XML::Document]
-      def link_definitions(doc)
-        logger.info("Finding and linking definitions")
-        terms = find_definitions(doc)
-        add_terms_to_references(doc, terms)
-        find_term_references(doc, terms)
-        renumber_terms(doc)
-      end
-      # Find `def` elements in the document and return a Hash from
-      # term ids to the text of each term
-      #
-      # @param doc [Nokogiri::XML::Document]
-      #
-      # @return [Hash{String, String}]
-      def find_definitions(doc)
-        guess_at_definitions(doc)
-        terms = {}
-        doc.xpath('//a:def', a: NS).each do |defn|
-          # <p>"<def refersTo="#term-affected_land">affected land</def>" means land in respect of which an application has been lodged in terms of section 17(1);</p>
-          if defn['refersTo']
-            id = defn['refersTo'].sub(/^#/, '')
-            term = defn.content
-            terms[id] = term
-            logger.info("+ Found definition for: #{term}")
-          end
-        end
-        terms
-      end
-      # Find defined terms in the document.
-      #
-      # This looks for heading elements with the words 'definitions' or 'interpretation',
-      # and then looks for phrases like
-      #
-      #   "this word" means something...
-      #
-      # It identifies "this word" as a defined term and wraps it in a def tag with a refersTo
-      # attribute referencing the term being defined. The surrounding block
-      # structure is also has its refersTo attribute set to the term. This way, the term
-      # is both marked as defined, and the container element with the full
-      # definition of the term is identified.
-      def guess_at_definitions(doc)
-        doc.xpath('//a:section', a: NS).select do |section|
-          # sections with headings like Definitions
-          heading = section.at_xpath('a:heading', a: NS)
-          heading && heading.content =~ /definition|interpretation/i
-        end.each do |section|
-          # find items like "foo" means blah...
-          section.xpath('.//a:p|.//a:listIntroduction', a: NS).each do |container|
-            # only if we don't already have a definition here
-            next if container.at_xpath('a:def', a: NS)
-            # get first text node
-            text = container.children.first
-            next if (not text or not text.text?)
-            match = /^\s*["“”](.+?)["“”]/.match(text.text)
-            if match
-              term = match.captures[0]
-              term_id = 'term-' + term.gsub(/[^a-zA-Z0-9_-]/, '_')
-              # <p>"<def refersTo="#term-affected_land">affected land</def>" means land in respect of which an application has been lodged in terms of section 17(1);</p>
-              refersTo = "##{term_id}"
-              defn = doc.create_element('def', term, refersTo: refersTo)
-              rest = match.post_match
-              text.before(defn)
-              defn.before(doc.create_text_node('"'))
-              text.content = '"' + rest
-              # adjust the container's refersTo attribute
-              parent = find_up(container, ['item', 'point', 'blockList', 'list', 'paragraph', 'subsection', 'section', 'chapter', 'part'])
-              parent['refersTo'] = refersTo
-            end
-          end
-        end
-      end
-      def add_terms_to_references(doc, terms)
-        refs = doc.at_xpath('//a:meta/a:references', a: NS)
-        unless refs
-          refs = doc.create_element('references', source: "#this")
-          doc.at_xpath('//a:meta/a:identification', a: NS).after(refs)
-        end
-        # nuke all existing term reference elements
-        refs.xpath('a:TLCTerm', a: NS).each { |el| el.remove }
-        for id, term in terms
-          # <TLCTerm id="term-applicant" href="/ontology/term/this.eng.applicant" showAs="Applicant"/>
-          refs << doc.create_element('TLCTerm',
-                                     id: id,
-                                     href: "/ontology/term/this.eng.#{id.gsub(/^term-/, '')}",
-                                     showAs: term)
-        end
-      end
-      # Find and decorate references to terms in the document.
-      # The +terms+ param is a hash from term_id to actual term.
-      def find_term_references(doc, terms)
-        logger.info("+ Finding references to terms")
-        i = 0
-        # sort terms by the length of the defined term, desc,
-        # so that we don't find short terms inside longer
-        # terms
-        terms = terms.to_a.sort_by { |pair| -pair[1].size }
-        # look for each term
-        for term_id, term in terms
-          doc.xpath('//a:body//text()', a: NS).each do |text|
-            # replace all occurrences in this text node
-            # unless we're already inside a def or term element
-            next if (["def", "term"].include?(text.parent.name))
-            # don't link to a term inside its own definition
-            owner = find_up(text, 'subsection')
-            next if owner and owner.at_xpath(".//a:def[@refersTo='##{term_id}']", a: NS)
-            while posn = (text.content =~ /\b#{Regexp::escape(term)}\b/)
-              # <p>A delegation under subsection (1) shall not prevent the <term refersTo="#term-Minister" id="trm357">Minister</term> from exercising the power himself or herself.</p>
-              node = doc.create_element('term', term, refersTo: "##{term_id}", id: "trm#{i}")
-              pre = (posn > 0) ? text.content[0..posn-1] : nil
-              post = text.content[posn+term.length..-1]
-              text.before(node)
-              node.before(doc.create_text_node(pre)) if pre
-              text.content = post
-              i += 1
-            end
-          end
-        end
-      end
-      # recalculate ids for <term> elements
-      def renumber_terms(doc)
-        logger.info("Renumbering terms")
-        doc.xpath('//a:term', a: NS).each_with_index do |term, i|
-          term['id'] = "trm#{i}"
-        end
-      end
       # Adjust blocklists:
       #
       # - nest them correctly