RubyGems - slaw - Versions diffs - 0.1.2 - Mend

slaw 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

checksums.yaml +7 -0
data/.gitignore +17 -0
data/Gemfile +5 -0
data/LICENSE.txt +22 -0
data/README.md +31 -0
data/Rakefile +7 -0
data/lib/slaw/act.rb +243 -0
data/lib/slaw/bylaw.rb +53 -0
data/lib/slaw/collection.rb +32 -0
data/lib/slaw/elasticsearch.rb +107 -0
data/lib/slaw/lifecycle_event.rb +23 -0
data/lib/slaw/logging.rb +14 -0
data/lib/slaw/namespace.rb +7 -0
data/lib/slaw/parse/blocklists.rb +181 -0
data/lib/slaw/parse/builder.rb +263 -0
data/lib/slaw/parse/bylaw.treetop +259 -0
data/lib/slaw/parse/cleanser.rb +171 -0
data/lib/slaw/parse/error.rb +26 -0
data/lib/slaw/parse/grammar_helpers.rb +11 -0
data/lib/slaw/parse/nodes.rb +371 -0
data/lib/slaw/render/html.rb +53 -0
data/lib/slaw/render/xsl/act.xsl +15 -0
data/lib/slaw/render/xsl/elements.xsl +116 -0
data/lib/slaw/render/xsl/fragment.xsl +16 -0
data/lib/slaw/version.rb +3 -0
data/lib/slaw/xml_support.rb +77 -0
data/lib/slaw.rb +24 -0
data/slaw.gemspec +30 -0
data/spec/parse/builder_spec.rb +543 -0
data/spec/parse/bylaw_spec.rb +365 -0
data/spec/parse/cleanser_spec.rb +126 -0
data/spec/spec_helper.rb +1 -0
data/spec/xml_helpers.rb +46 -0
metadata +194 -0

data/lib/slaw/parse/cleanser.rb ADDED Viewed

@@ -0,0 +1,171 @@
+# encoding: utf-8
+module Slaw
+  module Parse
+    # Helper class to run various cleanup routines on plain text.
+    #
+    # Some of these routines can safely be run multiple times,
+    # others are meant to be run only once.
+    class Cleanser
+      # Run the general cleanup passes on +s+: remove unwanted characters,
+      # normalise line endings, quotes and tabs, trim whitespace and blank out
+      # boilerplate lines. Intended to be safe to run multiple times.
+      def cleanup(s) # returns the string produced by the final pass
+        s = scrub(s)
+        s = correct_newlines(s)
+        s = fix_quotes(s)
+        s = expand_tabs(s)
+        s = chomp(s)
+        s = enforce_newline(s)
+        s = remove_boilerplate(s) # runs after enforce_newline, so its edits are applied last
+      end
+      # Run the heavier reformatting passes on +s+: re-join lines that look
+      # wrongly broken, break lines that look wrongly joined, strip a leading
+      # table of contents. These may not be safe to run multiple times.
+      def reformat(s) # returns the string produced by the final pass
+        s = unbreak_lines(s)
+        s = break_lines(s)
+        s = strip_toc(s)
+        s = enforce_newline(s)
+      end
+      # ------------------------------------------------------------------------
+      def remove_empty_lines(s) # returns a copy of s without blank lines
+        s.gsub(/\n\s*$/, '') # delete each newline that is followed only by whitespace up to a line end
+      end
+      # normalise line endings to "\n": first CRLF pairs, then any remaining bare CR
+      def correct_newlines(s)
+        s.gsub(/\r\n/, "\n")\
+         .gsub(/\r/, "\n")
+      end
+      # strip invalid bytes and ones we don't like
+      def scrub(s)
+        # we often get codepoint 65532 (U+FFFC, the object replacement character) in the string, nuke it
+        s.gsub([65532].pack('U*'), '')\
+         .gsub("", '')
+      end
+      def fix_quotes(s)
+        # replace each doubled single quote (‘‘, ’’ or '') with one straight double quote
+        s.gsub(/‘‘|’’|''/, '"')
+      end
+      def expand_tabs(s)
+        # replace every tab character with a single space
+        s.gsub(/\t/, ' ')
+      end
+      def remove_boilerplate(s)
+        # blank out Sabinet/Government Printer lines, gazette header lines and number-only lines
+        s.gsub(/^.*Sabinet.*Government Printer.*$/i, '')\
+         .gsub(/^.*Provincial Gazette \d+.*$/i, '')\
+         .gsub(/^.*Provinsiale Koerant \d+.*$/i, '')\
+         .gsub(/^\s*\d+\s*$/, '')\
+        # blank out date-like lines (digits, a word, digits)
+         .gsub(/^\d+\s+\w+\s+\d+$/, '')\
+        # remove "page N" / "page N of M" lines, including their trailing newline
+         .gsub(/^\s*page \d+( of \d+)?\s*\n/i, '')
+      end
+      def chomp(s)
+        # remove runs of trailing spaces at the end of every line
+        s = s.gsub(/ +$/, '')
+        # then strip leading and trailing whitespace from the string as a whole
+        s.strip
+      end
+      def enforce_newline(s)
+        # return s itself if it already ends with "\n", otherwise a new string with "\n" appended
+        s.end_with?("\n") ? s : (s + "\n")
+      end
+      # make educated guesses about lines that should have been broken
+      # but haven't, and insert the missing line breaks; returns the new string
+      def break_lines(s)
+        # often we find a section title munged onto the same line as its first statement
+        # eg:
+        # foo bar. New section title 62. (1) For the purpose
+        s = s.gsub(/\. ([^.]+) (\d+\. \(1\) )/, ".\n" + '\1' + "\n" + '\2')
+        # New section title 62. (1) For the purpose
+        s = s.gsub(/(\w) (\d+\. \(1\) )/, '\1' + "\n" + '\2')
+        # (1) foo; (2) bar
+        # (1) foo. (2) bar
+        s = s.gsub(/(\w{3,}[;.]) (\([0-9a-z]+\))/, "\\1\n\\2")
+        # (1) foo; and (2) bar
+        # (1) foo; or (2) bar
+        s = s.gsub(/; (and|or) \(/, "; \\1\n(")
+        # The officer-in-Charge may – (a) remove all withered natural... \n(b)
+        # We do this after the breaks above, because by now we should have recognised
+        # that (b) belongs on a new line.
+        s = s.gsub(/ (\(a\) .+?\n\(b\))/, "\n\\1")
+        # "foo" means ...; "bar" means
+        s = s.gsub(/; (["”“][^"”“]+?["”“] means)/, ";\n\\1")
+        s
+      end
+      # finds likely candidates for unnecessarily broken lines and unbreaks them:
+      # a line matching start_re is joined (with a space) onto a previous line matching end_re
+      def unbreak_lines(s)
+        lines = s.split(/\n/)
+        output = []
+        start_re = /^\s*[a-z]/ # begins, after optional whitespace, with a lowercase letter
+        end_re   = /[a-z0-9]\s*$/ # ends, before optional whitespace, with a lowercase letter or digit
+        prev = nil
+        lines.each_with_index do |line, i|
+          if i == 0
+            output << line
+          else
+            prev = output[-1] # the last output line, which may already be a joined line
+            if line =~ start_re and prev =~ end_re
+              output[-1] = prev + ' ' + line
+            else
+              output << line
+            end
+          end
+        end
+        output.join("\n")
+      end
+      # do our best to remove table of contents at the start,
+      # it really confuses the grammar
+      def strip_toc(s)
+        # first, try to find 'TABLE OF CONTENTS' (any case) near the start of the text (s[0..4096]),
+        if toc_start = s[0..4096].match(/TABLE OF CONTENTS/i)
+          # grab the first non-blank line after that, it's our end-of-TOC marker
+          if eol = s.match(/^(.+?)$/, toc_start.end(0))
+            marker = eol[0]
+            # search for the first later line that is a prefix of marker (or vice versa), and
+            # delete everything from the TOC heading up to, but not including, that line
+            posn = eol.end(0)
+            while m = s.match(/^(.+?)$/, posn)
+              if marker.start_with?(m[0]) or m[0].start_with?(marker)
+                return s[0...toc_start.begin(0)] + s[m.begin(0)..-1]
+              end
+              posn = m.end(0)
+            end
+          end
+        end
+        s # no TOC heading found, or no later line matched the marker: return s unchanged
+      end
+    end
+  end
+end

data/lib/slaw/parse/error.rb ADDED Viewed

@@ -0,0 +1,26 @@
+module Slaw
+  module Parse
+    class ParseError < Exception # NOTE(review): subclasses Exception, so a bare `rescue` (StandardError only) will not catch it
+      attr_accessor :line, :column
+      def initialize(message, opts) # opts is read with the keys :line and :column
+        super(message)
+        self.line = opts[:line]
+        self.column = opts[:column]
+      end
+      # TODO: move this elsewhere, it's out of context here
+      def to_json(g=nil) # serialises message, line and column; g is passed through to Hash#to_json
+        msg = self.message
+        msg = msg[0..200] + '...' if msg.length > 200 # truncate long messages; 0..200 keeps the first 201 characters
+        {
+          message: msg,
+          line: self.line,
+          column: self.column,
+        }.to_json(g)
+      end
+    end
+  end
+end

data/lib/slaw/parse/grammar_helpers.rb ADDED Viewed

@@ -0,0 +1,11 @@
+module Slaw
+  module Parse
+    module GrammarHelpers # provides a writable +options+ attribute with a default value
+      attr_writer :options
+      def options
+        @options ||= {} # an empty hash is created and stored the first time it is read while unset
+      end
+    end
+  end
+end

data/lib/slaw/parse/nodes.rb ADDED Viewed

@@ -0,0 +1,371 @@
+module Slaw
+  module Parse
+    module Bylaw
+      class Bylaw < Treetop::Runtime::SyntaxNode # renders a by-law as an <act> element followed by its schedules
+        def to_xml(b) # b is the builder object the XML is written to
+          b.act(contains: "originalVersion") { |b|
+            b.meta { |b|
+              b.identification(source: "#openbylaws") { |b|
+                # TODO: correct values -- the URIs, dates and names in this meta block are hard-coded
+                b.FRBRWork { |b|
+                  b.FRBRthis(value: '/za/by-law/locale/1980/name/main')
+                  b.FRBRuri(value: '/za/by-law/locale/1980/name')
+                  b.FRBRalias(value: 'By-Law Short Title')
+                  b.FRBRdate(date: '1980-01-01', name: 'Generation')
+                  b.FRBRauthor(href: '#council', as: '#author')
+                  b.FRBRcountry(value: 'za')
+                }
+                b.FRBRExpression { |b|
+                  b.FRBRthis(value: '/za/by-law/locale/1980/name/main/eng@')
+                  b.FRBRuri(value: '/za/by-law/locale/1980/name/eng@')
+                  b.FRBRdate(date: '1980-01-01', name: 'Generation')
+                  b.FRBRauthor(href: '#council', as: '#author')
+                  b.FRBRlanguage(language: 'eng')
+                }
+                b.FRBRManifestation { |b|
+                  b.FRBRthis(value: '/za/by-law/locale/1980/name/main/eng@')
+                  b.FRBRuri(value: '/za/by-law/locale/1980/name/eng@')
+                  b.FRBRdate(date: Time.now.strftime('%Y-%m-%d'), name: 'Generation')
+                  b.FRBRauthor(href: '#openbylaws', as: '#author')
+                }
+              }
+              b.publication(date: '1980-01-01',
+                            name: 'Province of Western Cape: Provincial Gazette',
+                            number: 'XXXX',
+                            showAs: 'Province of Western Cape: Provincial Gazette')
+              b.references(source: "#this") {
+                b.TLCOrganization(id: 'openbylaws', href: 'http://openbylaws.org.za', showAs: "openbylaws.org.za")
+                b.TLCOrganization(id: 'council', href: '/ontology/organization/za/council.cape-town', showAs: "Cape Town City Council")
+                b.TLCRole(id: 'author', href: '/ontology/role/author', showAs: 'Author')
+              }
+            }
+            if preamble.text_value != "" # the <preamble> element is only emitted when the preamble matched some text
+              b.preamble { |b|
+                preamble.to_xml(b)
+              }
+            end
+            b.body { |b|
+              chapters.elements.each { |e| e.to_xml(b) }
+            }
+          }
+          schedules.to_xml(b) # written after, not inside, the <act> element
+        end
+      end
+      class Preamble < Treetop::Runtime::SyntaxNode
+        def to_xml(b) # emits one <p> per preamble statement
+          statements.elements.each { |e|
+            if not (e.content.text_value =~ /^preamble/i) # skip statements whose text starts with "preamble" (any case)
+              b.p(e.content.text_value)
+            end
+          }
+        end
+      end
+      class Part < Treetop::Runtime::SyntaxNode
+        def num # the heading's number, or nil when there is no heading
+          heading.empty? ? nil : heading.num
+        end
+        def to_xml(b)
+          # do we have a part heading?
+          if not heading.empty?
+            id = "part-#{num}"
+            # prefix the id with the chapter number when our grandparent node is a numbered Chapter
+            if parent and parent.parent.is_a?(Chapter) and parent.parent.num
+              id = "chapter-#{parent.parent.num}.#{id}"
+            end
+            b.part(id: id) { |b|
+              heading.to_xml(b)
+              sections.elements.each { |e| e.to_xml(b) }
+            }
+          else
+            # no part heading: emit the sections directly, without a <part> wrapper
+            sections.elements.each { |e| e.to_xml(b) }
+          end
+        end
+      end
+      class PartHeading < Treetop::Runtime::SyntaxNode
+        def num # text of the alphanums node inside the heading prefix
+          part_heading_prefix.alphanums.text_value
+        end
+        def title # text of the heading's content node
+          content.text_value
+        end
+        def to_xml(b) # emits <num> followed by <heading>
+          b.num(num)
+          b.heading(title)
+        end
+      end
+      class Chapter < Treetop::Runtime::SyntaxNode
+        def num # the heading's number, or nil when there is no heading
+          heading.empty? ? nil : heading.num
+        end
+        def to_xml(b)
+          # do we have a chapter heading?
+          if not heading.empty?
+            id = "chapter-#{num}"
+            # prefix the id with the part number when our grandparent node is a numbered Part
+            if parent and parent.parent.is_a?(Part) and parent.parent.num
+              id = "part-#{parent.parent.num}.#{id}"
+            end
+            b.chapter(id: id) { |b|
+              heading.to_xml(b)
+              parts.elements.each { |e| e.to_xml(b) }
+            }
+          else
+            # no chapter heading: emit the parts directly, without a <chapter> wrapper
+            parts.elements.each { |e| e.to_xml(b) }
+          end
+        end
+      end
+      class ChapterHeading < Treetop::Runtime::SyntaxNode
+        def num # text of the alphanums node inside the heading prefix
+          chapter_heading_prefix.alphanums.text_value
+        end
+        def title # heading text, taken from whichever accessor this node has; nil if it has neither
+          if self.respond_to? :heading
+            heading.content.text_value
+          elsif self.respond_to? :content
+            content.text_value
+          end
+        end
+        def to_xml(b)
+          b.num(num)
+          b.heading(title) if title # <heading> is omitted when there is no title
+        end
+      end
+      class Section < Treetop::Runtime::SyntaxNode
+        def num # delegated to the section_title node
+          section_title.num
+        end
+        def title # delegated to the section_title node
+          section_title.title
+        end
+        def to_xml(b) # emits a <section> holding <num>, <heading> and the subsections
+          id = "section-#{num}"
+          b.section(id: id) { |b|
+            b.num("#{num}.")
+            b.heading(title)
+            idprefix = "#{id}."
+            subsections.elements.each_with_index { |e, i| e.to_xml(b, i, idprefix) }
+          }
+        end
+      end
+      class SectionTitleType1 < Treetop::Runtime::SyntaxNode
+        # a section title of the form (title line first, then the numbered text):
+        #
+        # Definitions
+        # 1. In this by-law...
+        def num # text of the number_letter node inside the title prefix
+          section_title_prefix.number_letter.text_value
+        end
+        def title # text of the content node
+          content.text_value
+        end
+      end
+      class SectionTitleType2 < Treetop::Runtime::SyntaxNode
+        # a section title of the form (number and title on one line, text below):
+        #
+        # 1. Definitions
+        # In this by-law...
+        #
+        # In this format, the title is optional and the section content may
+        # start where we think the title is.
+        def num # text of the number_letter node inside the title prefix
+          section_title_prefix.number_letter.text_value
+        end
+        def title # the title text, or an empty string when no title was matched
+          section_title.empty? ? "" : section_title.content.text_value
+        end
+      end
+      class Subsection < Treetop::Runtime::SyntaxNode
+        def to_xml(b, i, idprefix) # i is this subsection's index; idprefix is prepended to generated ids
+          if statement.is_a?(NumberedStatement)
+            attribs = {id: idprefix + statement.num.gsub(/[()]/, '')} # id uses the statement number without parentheses
+          else
+            attribs = {id: idprefix + "subsection-#{i}"}
+          end
+          idprefix = attribs[:id] + "." # nested elements extend this subsection's id
+          b.subsection(attribs) { |b|
+            b.num(statement.num) if statement.is_a?(NumberedStatement)
+            b.content { |b|
+              if blocklist and blocklist.is_a?(Blocklist)
+                if statement.content
+                  blocklist.to_xml(b, i, idprefix) { |b| b << statement.content.text_value }
+                else
+                  blocklist.to_xml(b, i, idprefix)
+                end
+              else
+                # no block list: emit the statement text, if any, as a single <p>
+                b.p(statement.content.text_value) if statement.content
+              end
+            }
+          }
+        end
+      end
+      class NumberedStatement < Treetop::Runtime::SyntaxNode
+        def num # text of the num node inside the statement prefix
+          numbered_statement_prefix.num.text_value
+        end
+        def parentheses? # true unless the prefix node responds to dotted_number_2
+          !numbered_statement_prefix.respond_to? :dotted_number_2
+        end
+        def content # content of the fourth child element, or nil when that element matched no text
+          if elements[3].text_value == ""
+            nil
+          else
+            elements[3].content
+          end
+        end
+      end
+      class NakedStatement < Treetop::Runtime::SyntaxNode # defines no methods of its own
+      end
+      class Blocklist < Treetop::Runtime::SyntaxNode
+        # Render a block list to xml as a <blockList> whose id is idprefix + "list" + i. If a block
+        # is given, yield to it a builder to fill a listIntroduction node placed before the items
+        def to_xml(b, i, idprefix, &block)
+          id = idprefix + "list#{i}"
+          idprefix = id + '.' # each item's id extends the list id
+          b.blockList(id: id) { |b|
+            b.listIntroduction { |b| yield b } if block_given?
+            elements.each { |e| e.to_xml(b, idprefix) }
+          }
+        end
+      end
+      class BlocklistItem < Treetop::Runtime::SyntaxNode
+        def num # the full text of the item prefix node
+          blocklist_item_prefix.text_value
+        end
+        def content
+          # TODO this really seems a bit odd -- returns nil unless item_content exists and has a content node
+          item_content.content.text_value if respond_to? :item_content and item_content.respond_to? :content
+        end
+        def to_xml(b, idprefix)
+          b.item(id: idprefix + num.gsub(/[()]/, '')) { |b|
+            b.num(num)
+            b.p(content) if content # <p> is omitted when the item has no content
+          }
+        end
+      end
+      class ScheduleContainer < Treetop::Runtime::SyntaxNode
+        def to_xml(b) # wraps all schedules in one <components>/<component>/<doc> with hard-coded metadata
+          return if schedules.elements.empty? # nothing is emitted when there are no schedules
+          b.components { |b|
+            b.component(id: 'component-0') { |b|
+              b.doc(name: 'schedules') { |b|
+                b.meta { |b|
+                  b.identification(source: "#openbylaws") { |b|
+                    b.FRBRWork { |b|
+                      b.FRBRthis(value: '/za/by-law/locale/1980/name/main/schedules')
+                      b.FRBRuri(value: '/za/by-law/locale/1980/name/schedules')
+                      b.FRBRdate(date: '1980-01-01', name: 'Generation')
+                      b.FRBRauthor(href: '#council', as: '#author')
+                      b.FRBRcountry(value: 'za')
+                    }
+                    b.FRBRExpression { |b|
+                      b.FRBRthis(value: '/za/by-law/locale/1980/name/main//schedules/eng@') # NOTE(review): double slash, unlike the other URIs here -- confirm
+                      b.FRBRuri(value: '/za/by-law/locale/1980/name/schedules/eng@')
+                      b.FRBRdate(date: '1980-01-01', name: 'Generation')
+                      b.FRBRauthor(href: '#council', as: '#author')
+                      b.FRBRlanguage(language: 'eng')
+                    }
+                    b.FRBRManifestation { |b|
+                      b.FRBRthis(value: '/za/by-law/locale/1980/name/main/schedules/eng@')
+                      b.FRBRuri(value: '/za/by-law/locale/1980/name/schedules/eng@')
+                      b.FRBRdate(date: Time.now.strftime('%Y-%m-%d'), name: 'Generation')
+                      b.FRBRauthor(href: '#openbylaws', as: '#author')
+                    }
+                  }
+                }
+                b.mainBody { |b|
+                  schedules.elements.each_with_index { |e, i| e.to_xml(b, i) }
+                }
+              }
+            }
+          }
+        end
+      end
+      class Schedule < Treetop::Runtime::SyntaxNode
+        def num # the schedule number text, or nil when it is missing or empty
+          n = schedule_heading.num.text_value
+          return (n && !n.empty?) ? n : nil
+        end
+        def heading # the schedule title text, or nil when the title node has no content
+          if schedule_heading.schedule_title.respond_to? :content
+            schedule_heading.schedule_title.content.text_value
+          else
+            nil
+          end
+        end
+        def to_xml(b, i) # i is accepted but not used in this method
+          n = num
+          id = if n
+                 "schedule-#{n}"
+               else
+                 "schedules"
+               end
+          b.chapter(id: id) { |b|
+            b.num(num) if num
+            b.heading(heading) if heading
+            b.section(id: id + ".section-0") { |b|
+              b.content { |b|
+                statements.elements.each { |e| b.p(e.content.text_value) } # one <p> per statement
+              }
+            }
+          }
+        end
+      end
+    end
+  end
+end

data/lib/slaw/render/html.rb ADDED Viewed

@@ -0,0 +1,53 @@
+module Slaw
+  module Render
+    # Support for transforming Akoma Ntoso (AN) XML documents into HTML.
+    class HTMLRenderer
+      def initialize # loads the two stylesheets from the xsl directory next to this file
+        here = File.dirname(__FILE__)
+        @xslt = {
+          act: Nokogiri::XSLT(File.open(File.join([here, 'xsl/act.xsl']))), # NOTE(review): these File.open handles are never explicitly closed
+          fragment: Nokogiri::XSLT(File.open(File.join([here, 'xsl/fragment.xsl']))),
+        }
+      end
+      # Transform an entire XML document +doc+ (a Nokogiri::XML::Document object) into HTML,
+      # returned as a string. Specify +base_url+ to manage the base for relative URLs
+      # generated by the transform.
+      def render(doc, base_url='')
+        params = transform_params({'base_url' => base_url})
+        run_xslt(:act, doc, params)
+      end
+      # Transform just a single node and its children into HTML.
+      #
+      # If +node+ has an id, we use xpath to tell the XSLT which
+      # element to transform. Otherwise we make the node the root of a new
+      # document and apply the XSLT to that.
+      def render_node(node, base_url='')
+        params = transform_params({'base_url' => base_url})
+        if node.id
+          params += ['root_elem', "//*[@id='#{node.id}']"]
+          doc = node.document
+        else
+          # create a new document with just this element at the root
+          doc = Nokogiri::XML::Document.new
+          doc.root = node
+          params += ['root_elem', '*'] # appended after quote_params, so root_elem is passed without quoting
+        end
+        run_xslt(:fragment, doc, params)
+      end
+      def run_xslt(xslt, doc, params) # xslt is a key of @xslt (:act or :fragment); returns the result as a string
+        @xslt[xslt].transform(doc, params).to_s
+      end
+      def transform_params(params) # delegates to Nokogiri::XSLT.quote_params
+        Nokogiri::XSLT.quote_params(params)
+      end
+    end
+  end
+end

data/lib/slaw/render/xsl/act.xsl ADDED Viewed

@@ -0,0 +1,15 @@
+<?xml version="1.0"?>
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
+  xmlns:a="http://www.akomantoso.org/2.0"
+  exclude-result-prefixes="a">
+  <xsl:import href="elements.xsl" /> <!-- imports the templates defined in elements.xsl -->
+  <xsl:output method="html" />
+  <xsl:template match="/"> <!-- entry point: only the act element under the akomaNtoso root is processed -->
+    <xsl:apply-templates select="a:akomaNtoso/a:act" />
+  </xsl:template>
+</xsl:stylesheet>