RubyGems - slaw - Versions diffs - 0.1.2 - Mend

slaw 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

checksums.yaml +7 -0
data/.gitignore +17 -0
data/Gemfile +5 -0
data/LICENSE.txt +22 -0
data/README.md +31 -0
data/Rakefile +7 -0
data/lib/slaw/act.rb +243 -0
data/lib/slaw/bylaw.rb +53 -0
data/lib/slaw/collection.rb +32 -0
data/lib/slaw/elasticsearch.rb +107 -0
data/lib/slaw/lifecycle_event.rb +23 -0
data/lib/slaw/logging.rb +14 -0
data/lib/slaw/namespace.rb +7 -0
data/lib/slaw/parse/blocklists.rb +181 -0
data/lib/slaw/parse/builder.rb +263 -0
data/lib/slaw/parse/bylaw.treetop +259 -0
data/lib/slaw/parse/cleanser.rb +171 -0
data/lib/slaw/parse/error.rb +26 -0
data/lib/slaw/parse/grammar_helpers.rb +11 -0
data/lib/slaw/parse/nodes.rb +371 -0
data/lib/slaw/render/html.rb +53 -0
data/lib/slaw/render/xsl/act.xsl +15 -0
data/lib/slaw/render/xsl/elements.xsl +116 -0
data/lib/slaw/render/xsl/fragment.xsl +16 -0
data/lib/slaw/version.rb +3 -0
data/lib/slaw/xml_support.rb +77 -0
data/lib/slaw.rb +24 -0
data/slaw.gemspec +30 -0
data/spec/parse/builder_spec.rb +543 -0
data/spec/parse/bylaw_spec.rb +365 -0
data/spec/parse/cleanser_spec.rb +126 -0
data/spec/spec_helper.rb +1 -0
data/spec/xml_helpers.rb +46 -0
metadata +194 -0

data/lib/slaw/parse/blocklists.rb ADDED Viewed

@@ -0,0 +1,181 @@
+module Slaw
+  module Parse
+    module Blocklists
+      include Slaw::Namespace
+      # Correctly re-nest nested block lists.
+      #
+      # (a)
+      # (b)
+      # (i)
+      # (ii)
+      # (aa)
+      # (bb)
+      # (c)
+      # (d)
+      #
+      # becomes
+      #
+      # (a)
+      # (b)
+      #   (i)
+      #   (ii)
+      #     (aa)
+      #     (bb)
+      # (c)
+      # (d)
+      #
+      def self.nest_blocklists(doc)
+        doc.xpath('//a:blockList', a: NS).each do |blocklist|
+          items = blocklist.xpath('a:item', a: NS)
+          nest_blocklist_items(items.to_a, guess_number_format(items.first), nil, nil)
+        end
+      end
+      # New blocklist nesting, starting with the first element of +items+.
+      #
+      # Walks +items+ from the front, keeping items whose numbering format
+      # matches +our_number_format+ in the current list and recursing to build
+      # a sub-list when a "deeper" format is seen.
+      #
+      # +items+ is an Array of item elements that is consumed (shifted) as
+      # items are handled; the same Array is shared with recursive calls, so
+      # items a nested call consumes are gone for the caller too.
+      # +our_number_format+ is the NumberingFormat of the list at this level.
+      # +list+ is the blockList element that matching items are appended to,
+      # or nil for the outermost call (items then stay where they are).
+      # +prev+ is the item handled just before the current one, or nil.
+      def self.nest_blocklist_items(items, our_number_format, list, prev)
+        return if items.empty?
+        item = items.shift
+        sublist_count = 0
+        while item and item.name == 'item'
+          # the format guessed on the previous pass of this loop (nil on the
+          # first pass) is handed back in as the fallback for ambiguous numbers
+          number_format = guess_number_format(item, number_format)
+          # no format could be determined: stop here; the current item is
+          # not pushed back onto +items+
+          break unless number_format
+          if number_format != our_number_format
+            # new sublist, or back to the old list?
+            if number_format < our_number_format
+              # back to the old list
+              # (push the item back so that the caller's loop picks it up)
+              items.unshift(item)
+              break
+            else
+              # new sublist.
+              #
+              # The blockList is inserted as a child of the sibling just before
+              # +item+, and that sibling's content is moved into the
+              # +listIntroduction+ of the new list.
+              sublist = item.document.create_element('blockList', id: prev['id'] + ".list#{sublist_count}")
+              sublist_count += 1
+              # list intro
+              # (the element following prev's <num>, if there is one, is
+              # renamed and moved into the new list)
+              num = prev.at_xpath('a:num', a: NS)
+              if intro = num.next_element
+                intro.name = 'listIntroduction'
+                sublist << intro
+              end
+              # make +item+ the first in this list
+              # (its id becomes the sublist id plus its number without brackets)
+              item['id'] = sublist['id'] + ".#{item.num.gsub(/[()]/, '')}"
+              sublist << item
+              # insert this list as a child of the previous item
+              prev << sublist
+              # now keep walking item's (old) siblings
+              # and pull in those elements that match our numbering
+              # scheme
+              nest_blocklist_items(items, number_format, sublist, item)
+            end
+          else
+            # same number format
+            # if this num is (i), we're numbering in :i, this isn't the first
+            # element in this list, then assume we're following (h) with (i)
+            if number_format.type == :i && item.num == "(i)" && prev
+              items.unshift(item)
+              break
+            else
+              # keep it with this list
+              # (only when there is a list to move it into; in the outermost
+              # call +list+ is nil and the item is left untouched)
+              if list
+                list << item
+                item['id'] = list['id'] + ".#{item.num.gsub(/[()]/, '')}"
+              end
+            end
+          end
+          prev = item
+          item = items.shift
+        end
+      end
+      def self.guess_number_format(item, prev_format=nil)
+        return nil unless item.num
+        prev = item.previous_element
+        nxt  = item.next_element
+        case item.num
+        when "(i)"
+          # Special case to detect difference between:
+          #
+          # (h) foo
+          # (i) bar
+          # (j) baz
+          #
+          # and
+          #
+          # (h) foo
+          #   (i)  bar
+          #   (ii) baz
+          #
+          # (i) is NOT a sublist if:
+          #   - there was a previous item (h), and
+          #     - there is not a next item, or
+          #     - the next item is something other than (ii)
+          if prev and prev.num =~ /^\(h/ and (!nxt or nxt.num != "(ii)")
+            NumberingFormat.a
+          else
+            NumberingFormat.i
+          end
+        when "(u)", "(v)", "(x)"
+          prev_format
+        when /^\([ivx]+/
+          NumberingFormat.i
+        when /^\([a-z][a-z]/
+          NumberingFormat.aa
+        when /^\([a-z]+/i
+          NumberingFormat.a
+        when /^\d+(\.\d+)+$/
+          NumberingFormat.new(:'i.i', item.num.count('.'))
+        else
+          NumberingFormat.unknown
+        end
+      end
+      class NumberingFormat
+        include Comparable
+        attr_accessor :type, :ordinal
+        def initialize(type, ordinal)
+          @type = type
+          @ordinal = ordinal
+        end
+        def eql?(other)
+          self.ordinal == other.ordinal
+        end
+        def <=>(other)
+          self.ordinal <=> other.ordinal
+        end
+        def to_s
+          @type.to_s
+        end
+        @@a = NumberingFormat.new(:a, 0)
+        @@i = NumberingFormat.new(:i, 1)
+        @@aa = NumberingFormat.new(:aa, 2)
+        @@unknown = NumberingFormat.new(:unknown, 3)
+        def self.a; @@a; end
+        def self.i; @@i; end
+        def self.aa; @@aa; end
+        def self.unknown; @@unknown; end
+      end
+    end
+  end
+end

data/lib/slaw/parse/builder.rb ADDED Viewed

@@ -0,0 +1,263 @@
+require 'builder'
+require 'treetop'
+module Slaw
+  module Parse
+    # Primary class for building Akoma Ntoso documents.
+    #
+    # It can convert from plain text a new Akoma Ntoso document, or
+    # update existing documents.
+    class Builder
+      include Slaw::Namespace
+      include Slaw::Logging
+      Treetop.load(File.dirname(__FILE__) + "/bylaw.treetop")
+      attr_accessor :parse_options
+      def initialize()
+        @parse_options = {}
+      end
+      # Try to parse plain text into a syntax tree
+      def text_to_syntax_tree(text, root=:bylaw)
+        parser = Slaw::Parse::BylawParser.new
+        parser.options = @parse_options
+        tree = parser.parse(text, {root: root})
+        if tree.nil?
+          raise Slaw::Parse::ParseError.new(parser.failure_reason || "Couldn't match to grammar",
+                                            line: parser.failure_line || 0,
+                                            column: parser.failure_column || 0)
+        end
+        tree
+      end
+      # Generate an XML document from the given syntax tree.
+      def xml_from_syntax_tree(tree)
+        s = ""
+        builder = ::Builder::XmlMarkup.new(indent: 2, target: s)
+        builder.instruct! :xml, :version=>"1.0", :encoding=>"UTF-8"
+        builder.akomaNtoso("xmlns:xsi"=> "http://www.w3.org/2001/XMLSchema-instance",
+                           "xsi:schemaLocation" => "http://www.akomantoso.org/2.0 akomantoso20.xsd",
+                           "xmlns" => NS) { |b|
+          tree.to_xml(b)
+        }
+        s
+      end
+      def parse_xml(xml)
+        Nokogiri::XML(xml, &:noblanks)
+      end
+      # Serialise +doc+ by calling its +to_xml+ with an indent of 2.
+      def to_xml(doc)
+        doc.to_xml(indent: 2)
+      end
+      # Run various postprocesses on the XML, and return
+      # the updated XML.
+      def postprocess(doc)
+        normalise_headings(doc)
+        find_short_title(doc)
+        sanitise(doc)
+      end
+      # Do sanitisations, such as finding and linking definitions
+      #
+      # Runs +link_definitions+ and then +nest_blocklists+ on +doc+; the
+      # return value is that of +nest_blocklists+.
+      def sanitise(doc)
+        link_definitions(doc)
+        nest_blocklists(doc)
+      end
+      # recalculate ids for <term> elements
+      def renumber_terms(doc)
+        logger.info("Renumbering terms")
+        doc.xpath('//a:term', a: NS).each_with_index do |term, i|
+          term['id'] = "trm#{i}"
+        end
+      end
+      # Change CAPCASE headings into Sentence case.
+      def normalise_headings(doc)
+        logger.info("Normalising headings")
+        nodes = doc.xpath('//a:body//a:heading/text()', a: NS) +
+                doc.xpath('//a:component/a:doc[@name="schedules"]//a:heading/text()', a: NS)
+        nodes.each do |heading|
+          heading.content = heading.content.downcase.gsub(/^\w/) { $&.upcase }
+        end
+      end
+      # Find the short title and add it as an FRBRalias element in the meta section
+      def find_short_title(doc)
+        logger.info("Finding short title")
+        # Short title and commencement
+        # 8. This Act shall be called the Legal Aid Amendment Act, 1996, and shall come
+        # into operation on a date fixed by the President by proclamation in the Gazette.
+        doc.xpath('//a:body//a:heading[contains(text(), "hort title")]', a: NS).each do |heading|
+          section = heading.parent.at_xpath('a:subsection', a: NS)
+          if section and section.text =~ /this act (is|shall be called) the (([a-zA-Z\(\)]\s*)+, \d\d\d\d)/i
+            short_title = $2
+            logger.info("+ Found title: #{short_title}")
+            node = doc.at_xpath('//a:meta//a:FRBRalias', a: NS)
+            node['value'] = short_title
+            break
+          end
+        end
+      end
+      # Find definitions of terms and introduce them into the
+      # meta section of the document.
+      #
+      # Steps, in order: collect the defined terms, rebuild the TLCTerm
+      # entries for them, mark up uses of the terms in the body, and finally
+      # renumber all <term> elements.
+      def link_definitions(doc)
+        logger.info("Finding and linking definitions")
+        terms = find_definitions(doc)
+        add_terms_to_references(doc, terms)
+        find_term_references(doc, terms)
+        renumber_terms(doc)
+      end
+      def find_definitions(doc)
+        guess_at_definitions(doc)
+        terms = {}
+        doc.xpath('//a:def', a: NS).each do |defn|
+          # <p>"<def refersTo="#term-affected_land">affected land</def>" means land in respect of which an application has been lodged in terms of section 17(1);</p>
+          id = defn['refersTo'].sub(/^#/, '')
+          term = defn.content
+          terms[id] = term
+          logger.info("+ Found definition for: #{term}")
+        end
+        terms
+      end
+      # Mark up probable term definitions with <def> elements.
+      #
+      # In sections whose heading mentions "definitions" or "interpretation",
+      # every <p> and <listIntroduction> that starts with a quoted phrase and
+      # has no direct <def> child yet gets that phrase wrapped in a <def>,
+      # and an enclosing element is given a "def-..." id. +doc+ is changed in
+      # place.
+      def guess_at_definitions(doc)
+        doc.xpath('//a:section', a: NS).select do |section|
+          # sections with headings like Definitions
+          heading = section.at_xpath('a:heading', a: NS)
+          heading && heading.content =~ /definitions|interpretation/i
+        end.each do |section|
+          # find items like "foo" means blah...
+          section.xpath('.//a:p|.//a:listIntroduction', a: NS).each do |container|
+            # only if we don't already have a definition here
+            next if container.at_xpath('a:def', a: NS)
+            # get first text node
+            # (skip containers that are empty or start with an element)
+            text = container.children.first
+            next if (not text or not text.text?)
+            # leading whitespace, then the shortest phrase between straight
+            # or curly double quotes
+            match = /^\s*["“”](.+?)["“”]/.match(text.text)
+            if match
+              term = match.captures[0]
+              # the id is the term with everything but letters, digits,
+              # '_' and '-' replaced by '_'
+              term_id = 'term-' + term.gsub(/[^a-zA-Z0-9_-]/, '_')
+              # <p>"<def refersTo="#term-affected_land">affected land</def>" means land in respect of which an application has been lodged in terms of section 17(1);</p>
+              defn = doc.create_element('def', term, refersTo: "##{term_id}")
+              rest = match.post_match
+              # resulting order: opening quote, <def>, then the original text
+              # node cut down to a closing quote plus the remainder; both
+              # quotes are written as straight quotes
+              text.before(defn)
+              defn.before(doc.create_text_node('"'))
+              text.content = '"' + rest
+              # adjust the container's id
+              # (nearest blockList/point ancestor if there is one, otherwise
+              # the nearest subsection/section ancestor)
+              parent = find_up(container, ['blockList', 'point']) || find_up(container, ['subsection', 'section'])
+              parent['id'] = "def-#{term_id}"
+            end
+          end
+        end
+      end
+      def add_terms_to_references(doc, terms)
+        refs = doc.at_xpath('//a:meta/a:references', a: NS)
+        unless refs
+          refs = doc.create_element('references', source: "#this")
+          doc.at_xpath('//a:meta/a:identification', a: NS).after(refs)
+        end
+        # nuke all existing term reference elements
+        refs.xpath('a:TLCTerm', a: NS).each { |el| el.remove }
+        for id, term in terms
+          # <TLCTerm id="term-applicant" href="/ontology/term/this.eng.applicant" showAs="Applicant"/>
+          refs << doc.create_element('TLCTerm',
+                                     id: id,
+                                     href: "/ontology/term/this.eng.#{id.gsub(/^term-/, '')}",
+                                     showAs: term)
+        end
+      end
+      # Find and decorate references to terms in the document.
+      # The +terms+ param is a hash from term_id to actual term.
+      def find_term_references(doc, terms)
+        logger.info("+ Finding references to terms")
+        i = 0
+        # sort terms by the length of the defined term, desc,
+        # so that we don't find short terms inside longer
+        # terms
+        terms = terms.to_a.sort_by { |pair| -pair[1].size }
+        # look for each term
+        for term_id, term in terms
+          doc.xpath('//a:body//text()', a: NS).each do |text|
+            # replace all occurrences in this text node
+            # unless we're already inside a def or term element
+            next if (["def", "term"].include?(text.parent.name))
+            # don't link to a term inside its own definition
+            owner = find_up(text, 'subsection')
+            next if owner and owner.at_xpath(".//a:def[@refersTo='##{term_id}']", a: NS)
+            while posn = (text.content =~ /\b#{Regexp::escape(term)}\b/)
+              # <p>A delegation under subsection (1) shall not prevent the <term refersTo="#term-Minister" id="trm357">Minister</term> from exercising the power himself or herself.</p>
+              node = doc.create_element('term', term, refersTo: "##{term_id}", id: "trm#{i}")
+              pre = (posn > 0) ? text.content[0..posn-1] : nil
+              post = text.content[posn+term.length..-1]
+              text.before(node)
+              node.before(doc.create_text_node(pre)) if pre
+              text.content = post
+              i += 1
+            end
+          end
+        end
+      end
+      # Log the step and delegate to Slaw::Parse::Blocklists.nest_blocklists.
+      def nest_blocklists(doc)
+        logger.info("Nesting blocklists")
+        Slaw::Parse::Blocklists.nest_blocklists(doc)
+      end
+      protected
+      # Look up the parent chain for an element that matches the given
+      # node name
+      def find_up(node, names)
+        names = Array(names)
+        for parent in node.ancestors
+          return parent if names.include?(parent.name)
+        end
+        nil
+      end
+    end
+  end
+end

data/lib/slaw/parse/bylaw.treetop ADDED Viewed

@@ -0,0 +1,259 @@
+require 'slaw/parse/grammar_helpers'
+module Slaw
+  module Parse
+    grammar Bylaw
+      include Slaw::Parse::GrammarHelpers
+      ########
+      # major containers
+      # a whole bylaw: preamble, any number of chapters, then the schedules
+      rule bylaw
+        preamble
+        chapters:chapter*
+        schedules:schedules <Bylaw>
+      end
+      # leading empty lines followed by any number of naked statements
+      rule preamble
+        empty_line*
+        statements:naked_statement* <Preamble>
+      end
+      # an optional chapter heading followed by one or more parts
+      rule chapter
+        heading:chapter_heading?
+        parts:part+ <Chapter>
+      end
+      # an optional part heading followed by one or more sections
+      rule part
+        heading:part_heading?
+        sections:section+ <Part>
+      end
+      # a section title followed by any number of subsections
+      rule section
+        section_title
+        subsections:subsection* <Section>
+      end
+      # a numbered or naked statement, optionally followed by a blocklist
+      rule subsection
+        statement:(numbered_statement / naked_statement)
+        blocklist:blocklist? <Subsection>
+      end
+      # container for zero or more schedules
+      rule schedules
+        schedules:schedule* <ScheduleContainer>
+      end
+      # a schedule heading followed by any number of schedule statements
+      rule schedule
+        schedule_heading
+        statements:schedule_statement* <Schedule>
+      end
+      ##########
+      # headings
+      # "Chapter <alphanums>" with optional heading text, which may start on
+      # a following line because +whitespace+ also matches newlines
+      rule chapter_heading
+        space? chapter_heading_prefix heading:(whitespace content)? eol
+        <ChapterHeading>
+      end
+      # "Part <alphanums>" followed by the heading text, on the same line or
+      # on the next one
+      rule part_heading
+        space? part_heading_prefix eol? space? content eol
+        <PartHeading>
+      end
+      # the two section title layouts; which one can match is decided by the
+      # :section_number_after_title parser option (see the predicates below)
+      rule section_title
+        section_title_1 / section_1_title
+      end
+      rule section_title_1
+        # only applies when the :section_number_after_title option is set
+        &{ |s| options[:section_number_after_title] }
+        # Section title
+        # 1. Section content
+        content eol
+        section_title_prefix whitespace <SectionTitleType1>
+      end
+      rule section_1_title
+        # 1. Section title
+        # Section content
+        #
+        # Additionally, the section title is optional.
+        # only applies when the :section_number_after_title option is not set
+        !{ |s| options[:section_number_after_title] }
+        section_title_prefix section_title:section_title_content? eol?
+        <SectionTitleType2>
+      end
+      # title text that follows a section number on the same line; it must
+      # not itself start with a numbered statement prefix
+      rule section_title_content
+        space !numbered_statement_prefix content eol
+        # if a section title ends in a non-character or it's really long, it's probably section content
+        # (s[2] is the +content+ element of the sequence above; the class is
+        # a-zA-Z, not A-z, which would also accept [ \ ] ^ _ and ` as letters)
+        !{ |s| s[2].text_value[-1] =~ /[^a-zA-Z]/ or s[2].text_value.length > 100 }
+      end
+      # "Schedule"/"Schedules", an optional (optionally double-quoted)
+      # alphanumeric number, end of line, then an optional title line
+      rule schedule_heading
+        space? schedule_heading_prefix
+        space? "\""? num:alphanums? "\""? space?
+        eol
+        schedule_title:schedule_title_content?
+      end
+      # the title line that may follow a schedule heading
+      rule schedule_title_content
+        space? content eol
+        # if a schedule title ends in a non-character or it's really long, it's probably content
+        # (s[1] is the +content+ element of the sequence above; the class is
+        # a-zA-Z, not A-z, which would also accept [ \ ] ^ _ and ` as letters)
+        !{ |s| s[1].text_value[-1] =~ /[^a-zA-Z]/ or s[1].text_value.length > 100 }
+      end
+      ##########
+      # statements
+      # a numbered prefix with optional text on the same line; the text is
+      # left unmatched when it starts with a blocklist item prefix
+      rule numbered_statement
+        space? numbered_statement_prefix whitespace? (!blocklist_item_prefix content eol)? <NumberedStatement>
+      end
+      # one line of content that is not the start of a part, chapter,
+      # section or schedule
+      rule naked_statement
+        space? !(part / chapter / section / schedule) content eol
+        <NakedStatement>
+      end
+      # one line of content inside a schedule that is not the heading of
+      # another schedule
+      rule schedule_statement
+        space? (!schedule_heading) content eol
+      end
+      ##########
+      # prefixes
+      # case-insensitive "part", a space, alphanumerics and an optional colon
+      rule part_heading_prefix
+        'part'i space alphanums ':'?
+      end
+      # case-insensitive "chapter", a space and alphanumerics
+      rule chapter_heading_prefix
+        'chapter'i space alphanums
+      end
+      # case-insensitive "schedule" or "schedules"
+      rule schedule_heading_prefix
+        'schedule'i 's'i?
+      end
+      # a section number such as 2 or 2A, with an optional trailing dot
+      rule section_title_prefix
+        number_letter '.'?
+      end
+      rule numbered_statement_prefix
+        # there are two subsection handling syntaxes:
+        #
+        # (1) foo
+        # (2A) foo
+        #
+        # and
+        #
+        # 8.2 for
+        # 8.3 bar
+        #
+        # The second is less common, but this allows us to handle it.
+        # Note that it is usually accompanied by a similar list number format:
+        #
+        # 8.2.1 item 1
+        # 8.2.2 item 2
+        #
+        # which aren't subsections, but lists, so force the space at the end
+        # of the number to catch this case.
+        num:('(' number_letter ')')
+        /
+        num:dotted_number_2 '.'? space
+      end
+      ##########
+      # blocklists
+      # one or more consecutive blocklist items
+      rule blocklist
+        blocklist_item+ <Blocklist>
+      end
+      # an item prefix with optional text on the same line; the text is left
+      # unmatched when it starts with another item prefix
+      rule blocklist_item
+        space? blocklist_item_prefix whitespace? item_content:(!blocklist_item_prefix content eol)?
+        <BlocklistItem>
+      end
+      # either a bracketed letter ordinal, eg. (a), or a dotted number with
+      # at least three parts, eg. 8.2.1
+      rule blocklist_item_prefix
+        ('(' letter_ordinal ')') / dotted_number_3
+      end
+      # a letter followed by any number of letters or digits
+      rule letter_ordinal
+        letter (letter / digit)*
+      end
+      #########
+      ## one line of basic content
+      rule content
+        # anything but a newline, followed by a
+        # newline or end of file (without consuming the newline)
+        [^\n]+ &eol
+      end
+      ##########
+      # terminals
+      # eg. 2, 2A, 2b
+      rule number_letter
+        number letter*
+      end
+      # three or more numbers separated by dots, eg. 8.2.1
+      rule dotted_number_3
+        number '.' number ('.' number)+
+      end
+      # exactly two numbers separated by a dot, eg. 8.2
+      rule dotted_number_2
+        number '.' number
+      end
+      # one or more digits
+      rule number
+        digit+
+      end
+      rule digit
+        [0-9]
+      end
+      rule letter
+        [a-zA-Z]
+      end
+      # one or more letters or digits
+      rule alphanums
+        [a-zA-Z0-9]+
+      end
+      # a straight or curly double quote
+      rule quotes
+        ["“”]
+      end
+      # any single character that is not a straight or curly double quote
+      rule non_quotes
+        [^"“”]
+      end
+      ##########
+      # whitespace
+      # one or more spaces or tabs
+      rule space
+        [ \t]+
+      end
+      # zero or more spaces, tabs or newlines (so this can match nothing,
+      # and can span lines)
+      rule whitespace
+        [ \t\n]*
+      end
+      # a line holding nothing but optional spaces and tabs
+      rule empty_line
+        space? newline
+      end
+      # a newline plus any empty lines that follow it
+      rule eol
+        newline
+        empty_line*
+      end
+      rule newline
+        "\n"
+      end
+    end
+  end
+end