RubyGems - slaw - Versions diffs - 0.1.2 → 0.1.3 - Mend

slaw 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

checksums.yaml +4 -4
data/.travis.yml +5 -0
data/Gemfile +0 -1
data/README.md +25 -3
data/Rakefile +2 -0
data/lib/slaw.rb +2 -1
data/lib/slaw/act.rb +101 -41
data/lib/slaw/bylaw.rb +16 -23
data/lib/slaw/collection.rb +30 -2
data/lib/slaw/extract/extractor.rb +93 -0
data/lib/slaw/parse/blocklists.rb +25 -17
data/lib/slaw/parse/builder.rb +97 -22
data/lib/slaw/parse/bylaw.treetop +2 -0
data/lib/slaw/parse/cleanser.rb +14 -8
data/lib/slaw/parse/grammar_helpers.rb +4 -0
data/lib/slaw/render/html.rb +25 -8
data/lib/slaw/render/xsl/elements.xsl +1 -1
data/lib/slaw/version.rb +1 -1
data/slaw.gemspec +0 -1
data/spec/extract/extractor_spec.rb +14 -0
data/spec/parse/builder_spec.rb +2 -0
data/spec/parse/bylaw_spec.rb +2 -0
data/spec/parse/cleanser_spec.rb +2 -0
metadata +6 -17
data/lib/slaw/elasticsearch.rb +0 -107

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: a0da8d0d88cfd753f8ef854937248c8881440390
-  data.tar.gz: 0762a47f6b0bac65d4b3fe829bc2a4997f3b23f9
+  metadata.gz: 680301c5ade280eb7da5ea92c509f491631824f2
+  data.tar.gz: f2ddd5a99631121bf3693da5f229e38a6f590142
 SHA512:
-  metadata.gz: 4ab788b276cd06d1735bb859a1f7fa08820f9bf65b2c6282df65ea1fd2303cbd5b42433366a3a0b2a7a20dbe227e78cc6b5caa2ab3b5cb988d6c2a27097f05ce
-  data.tar.gz: 3882e5a3b292dfcd9adecb0b2077f9cb21a5d5e76e90402a09c77f146a1ec3acb0272649daf03cc64556864bfd5d8a921d873cbb20534c626542894a02218372
+  metadata.gz: 844130f24fa5e4e7e2acd8bacc9381bbd043591676a4fd22e9f1deec87e99b813f3062e4c4ec7286aca4ec0fe2a17161c39d85f5a07c8819192c82cd6203e474
+  data.tar.gz: de11ab3cb747c7341209e79f131506f6e2fc44065a73d95bb936c2b36b348646644024b0b657252106cd4c6d9f1b792ca4f7884e8f45fff4bda453da0a736cb7

data/.travis.yml ADDED Viewed

@@ -0,0 +1,5 @@
+language: ruby
+rvm:
+  - 1.9.3
+  - 2.1.0
+  - 2.1.1

data/Gemfile CHANGED Viewed

@@ -1,5 +1,4 @@
 source 'https://rubygems.org'
-ruby '2.1.1'
 # Specify your gem's dependencies in slaw.gemspec
 gemspec

data/README.md CHANGED Viewed

@@ -1,4 +1,4 @@
-# Slaw
+# Slaw [![Build Status](https://travis-ci.org/longhotsummer/slaw.svg)](http://travis-ci.org/longhotsummer/slaw)
 Slaw is a lightweight library for rendering and generating Akoma Ntoso acts from plain text and PDF documents.
 It is used to power [openbylaws.org.za](http://openbylaws.org.za).
@@ -21,11 +21,33 @@ Or install it yourself as:
 TODO: Write usage instructions here
+### Extracting text from PDFs
+You will need [xpdf](http://www.foolabs.com/xpdf/) to run PDF extraction. If you're
+on a Mac you can use
+    brew install xpdf
+Extracting text from PDFs often breaks lines in odd places (or doesn't break them when it should). Slaw gets around
+this by running some cleanup routines on the extracted text.
+```ruby
+extractor = Slaw::Extract::Extractor.new
+# to guess the filetype by extension
+text = extractor.extract_from_file('/path/to/file.pdf')
+# or if you know it's a PDF
+text = extractor.extract_from_pdf('/path/to/file.pdf')
+# You can also "extract" text from a plain-text file
+text = extractor.extract_from_text('/path/to/file.txt')
+```
 ## Contributing
-1. Fork it ( http://github.com/longhotsummer/slaw/fork )
+1. Fork it at http://github.com/longhotsummer/slaw/fork
 2. Create your feature branch (`git checkout -b my-new-feature`)
 3. Commit your changes (`git commit -am 'Add some feature'`)
 4. Push to the branch (`git push origin my-new-feature`)
 5. Create new Pull Request

data/Rakefile CHANGED Viewed

@@ -5,3 +5,5 @@ begin
   RSpec::Core::RakeTask.new(:spec)
 rescue LoadError
 end
+task default: [:spec]

data/lib/slaw.rb CHANGED Viewed

@@ -18,7 +18,8 @@ require 'slaw/parse/cleanser'
 require 'slaw/parse/error'
 require 'slaw/parse/grammar_helpers'
 require 'slaw/parse/nodes'
-require 'slaw/elasticsearch'
+require 'slaw/extract/extractor'
 module Slaw
 end

data/lib/slaw/act.rb CHANGED Viewed

@@ -1,5 +1,12 @@
 module Slaw
-  # Wraps an AkomaNtoso 2.0 XML document describing an Act.
+  # An Act wraps a single {http://www.akomantoso.org/ AkomaNtoso 2.0 XML} act document in the form of a
+  # Nokogiri::XML::Document object.
+  #
+  # The Act object provides quick access to certain sections of the document,
+  # such as the metadata and the body, as well as common operations such as
+  # identifying whether it has been amended ({#amended?}), repealed
+  # ({#repealed?}) or what chapters ({#chapters}), parts ({#parts}) and
+  # sections ({#sections}) it contains.
   class Act
     include Slaw::Namespace
@@ -7,19 +14,45 @@ module Slaw
     # Act instance itself
     @@acts = {}
-    attr_accessor :doc, :meta, :body, :num, :year, :id_uri
-    attr_accessor :filename, :mtime
+    # [Nokogiri::XML::Document] The underlying {Nokogiri::XML::Document} instance
+    attr_accessor :doc
+    # [Nokogiri::XML::Node] The `meta` XML node
+    attr_accessor :meta
+    # [Nokogiri::XML::Node] The `body` XML node
+    attr_accessor :body
+    # [String] The year this act was published
+    attr_accessor :year
+    # [String] The act number in the year this act was published
+    attr_accessor :num
+    # [String] The FRBR URI of this act, which uniquely identifies it globally
+    attr_accessor :id_uri
+    # [String, nil] The source filename, or nil
+    attr_accessor :filename
+    # [Time, nil] The mtime of when the source file was last modified
+    attr_accessor :mtime
+    # Get the act that wraps the document that owns this XML node
+    # @param node [Nokogiri::XML::Node]
+    # @return [Act] owning act
     def self.for_node(node)
       @@acts[node.document]
     end
-    # Create a new instance
+    # Create a new instance, loading from `filename` if given.
+    # @param filename [String] filename to load XML from
     def initialize(filename=nil)
       self.load(filename) if filename
     end
-    # Load the XML from +filename+
+    # Load the XML in `filename` into this instance
+    # @param filename [String] filename
     def load(filename)
       @filename = filename
       @mtime = File::mtime(@filename)
@@ -27,7 +60,8 @@ module Slaw
       File.open(filename) { |f| parse(f) }
     end
-    # Parse the XML contained in the file-like object +io+
+    # Parse the XML contained in the file-like object `io`
+    # @param io [file-like] io object with XML
     def parse(io)
       @doc = Nokogiri::XML(io)
       @meta = @doc.at_xpath('/a:akomaNtoso/a:act/a:meta', a: NS)
@@ -35,10 +69,11 @@ module Slaw
       @@acts[@doc] = self
-      extract_id
+      _extract_id
     end
-    def extract_id
+    # Parse the FRBR Uri into its constituent parts
+    def _extract_id
       @id_uri = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRuri', a: NS)['value']
       empty, @country, type, date, @num = @id_uri.split('/')
@@ -46,48 +81,39 @@ module Slaw
       @year = date.split('-', 2)[0]
     end
+    # An applicable short title for this act, either from the `FRBRalias` element
+    # or based on the act number and year.
+    # @return [String]
     def short_title
-      unless @short_title
-        node = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRalias', a: NS)
-        if node
-          @short_title = node['value']
-        else
-          @short_title = "Act #{num} of #{year}"
-        end
-      end
-      @short_title
-    end
-    def url_path
-      "/#{@country}/acts/#{@year}/#{@num}/"
-    end
-    def url_file
-      "act-#{@year}-#{@num}"
+      node = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRalias', a: NS)
+      node ? node['value'] : "Act #{num} of #{year}"
     end
-    # Has this act been amended?
+    # Has this act been amended? This is determined by testing the `contains`
+    # attribute of the `act` root element.
+    #
+    # @return [Boolean]
     def amended?
       @doc.at_xpath('/a:akomaNtoso/a:act', a: NS)['contains'] != 'originalVersion'
     end
-    # a list of LifecycleEvent objects for amendment events, in date order
+    # Get a list of {Slaw::LifecycleEvent} objects for amendment events, in date order.
+    # @return [Array<Slaw::LifecycleEvent>] possibly empty list of lifecycle events
     def amendment_events
       @meta.xpath('./a:lifecycle/a:eventRef[@type="amendment"]', a: NS).map do |event|
         LifecycleEvent.new(event)
       end.sort_by { |e| e.date }
     end
-    # Mark this act as being amended by another act, either +act+
-    # or the details in +opts+:
-    #
-    #   :uri: uri of the amending act
-    #   :title: title of the amending act
-    #   :date: date of the amendment
+    # Mark this act as being amended by another act, either `act`
+    # or the details in `opts`.
     #
     # It is assumed that there can be only one amendment event on a particular
     # date. An existing amendment on this date is overwritten.
+    #
+    # @option opts [String] :uri uri of the amending act
+    # @option opts [String] :title title of the amending act
+    # @option opts [String] :date date of the amendment (YYYY-MM-DD)
     def amended_by!(act, opts={})
       if act
         opts[:uri] ||= act.id_uri
@@ -133,27 +159,40 @@ module Slaw
     end
     # Does this Act have parts?
+    # @return [Boolean]
     def parts?
       !parts.empty?
     end
+    # Top-level parts of this act. Parts inside chapters are ignored.
+    # @return [Array<Nokogiri::XML::Node>] part nodes
     def parts
       @body.xpath('./a:part', a: NS)
     end
+    # Does this Act have chapters?
+    # @return [Boolean]
     def chapters?
       !chapters.empty?
     end
+    # Top-level chapters of this act. Chapters inside parts are ignored.
+    # @return [Array<Nokogiri::XML::Node>] chapter nodes
     def chapters
       @body.xpath('./a:chapter', a: NS)
     end
+    # Sections of this act
+    # @return [Array<Nokogiri::XML::Node>] section nodes
     def sections
       @body.xpath('.//a:section', a: NS)
     end
-    # The XML node representing the definitions section
+    # The primary definitions section of this act, identified by
+    # either an `id` of `definitions` or the first section with a heading
+    # of `Definitions`.
+    #
+    # @return [Nokogiri::XML::Node, nil] definitions node or nil
     def definitions
       # try looking for the definition list
       defn = @body.at_css('#definitions')
@@ -166,14 +205,21 @@ module Slaw
       nil
     end
-    # The XML node representing the schedules document
+    # An act can contain schedules, additional (generally free-form) documents
+    # that are addendums to the main body. The schedules document must be
+    # part of a separate `component` and have a `doc` element with a name attribute
+    # of `schedules`.
+    #
+    # @return [Nokogiri::XML::Node, nil] schedules document node
     def schedules
       @doc.at_xpath('/a:akomaNtoso/a:components/a:component/a:doc[@name="schedules"]/a:mainBody', a: NS)
     end
-    # Get a map from term ids to +[term, defn]+ pairs,
-    # where +term+ is the text term NS+defn+ is
-    # the XML node with the definition in it.
+    # Get a map from term ids to `[term, defn]` pairs,
+    # where `term` is the plain text term and `defn` is
+    # the {Nokogiri::XML::Node} containing the definition.
+    #
+    # @return [Hash{String => Array(String, Nokogiri::XML::Node)}] map from term ids to `[term, definition]` pairs
     def term_definitions
       terms = {}
@@ -191,23 +237,31 @@ module Slaw
     end
     # Returns the publication element, if any.
+    #
+    # @return [Nokogiri::XML::Node, nil]
     def publication
       @meta.at_xpath('./a:publication', a: NS)
     end
     # Has this by-law been repealed?
+    #
+    # @return [Boolean]
     def repealed?
       !!repealed_on
     end
     # The date on which this act was repealed, or nil if never repealed
+    #
+    # @return [Time, nil] time of repeal, or nil if never repealed
     def repealed_on
       repeal_el = repeal
       repeal_el ? Time.parse(repeal_el['date']) : nil
     end
     # The element representing the reference that caused the repeal of this
-    # act, or nil
+    # act, or nil.
+    #
+    # @return [Nokogiri::XML::Node] element of reference to repealing act, or nil
     def repealed_by
       repeal_el = repeal
       return nil unless repeal_el
@@ -216,7 +270,9 @@ module Slaw
       @meta.at_xpath("./a:references/a:passiveRef[@id='#{source_id}']", a: NS)
     end
-    # The XML element representing the repeal of this act, or nil
+    # The XML element representing the event of repeal of this act, or nil
+    #
+    # @return [Nokogiri::XML::Node]
     def repeal
       # <lifecycle source="#this">
       #   <eventRef id="e1" date="2010-07-28" source="#original" type="generation"/>
@@ -226,11 +282,15 @@ module Slaw
       @meta.at_xpath('./a:lifecycle/a:eventRef[@type="repeal"]', a: NS)
     end
+    # The date at which this particular XML manifestation of this document was generated.
+    #
+    # @return [String] date, YYYY-MM-DD
     def manifestation_date
       node = @meta.at_xpath('./a:identification/a:FRBRManifestation/a:FRBRdate[@name="Generation"]', a: NS)
       node && node['date']
     end
+    # The underlying nature of this act, usually `act` although subclasses may override this.
     def nature
       "act"
     end

data/lib/slaw/bylaw.rb CHANGED Viewed

@@ -1,12 +1,19 @@
 require 'slaw/act'
 module Slaw
-  # Wraps an AkomaNtoso XML document describing an Act classed as a By-Law
+  # An extension of {Slaw::Act} which wraps an AkomaNtoso XML document describing a By-Law.
+  #
+  # There are minor differences between Acts and By-laws, the most notable being that a by-law
+  # is not identified by a year and a number, and therefore has a different FRBR uri structure.
   class ByLaw < Act
-    attr_accessor :region, :name
+    # [String] The region this by-law applies to
+    attr_accessor :region
+    # [String] A short file-like name of this by-law, unique within its year and region
+    attr_accessor :name
-    def extract_id
+    def _extract_id
       # /za/by-law/cape-town/2010/public-parks
       @id_uri = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRuri', a: NS)['value']
@@ -22,30 +29,16 @@ module Slaw
     end
     def short_title
-      unless @short_title
-        node = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRalias', a: NS)
-        if node
-          @short_title = node['value']
-        else
-          @short_title = "(Unknown)"
-        end
-        if amended? and not @short_title.end_with?("as amended")
-          @short_title = @short_title + " as amended"
-        end
-      end
+      node = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRalias', a: NS)
+      short_title = node ? node['value'] : "(Unknown)"
-      @short_title
-    end
+      if amended? and not short_title.end_with?("as amended")
+        short_title = short_title + " as amended"
+      end
-    def url_path
-      "/#{@country}/by-law/#{@region}/#{@year}/#{@name}/"
+      short_title
     end
-    def url_file
-      @name
-    end
     def nature
       "by-law"
     end

data/lib/slaw/collection.rb CHANGED Viewed

@@ -2,11 +2,28 @@ require 'forwardable'
 module Slaw
   # A collection of Act instances.
+  #
+  # This is useful for looking up acts by their FRBR uri and for
+  # loading a collection of XML act documents.
+  #
+  # This collection is enumerable and can be iterated over. Use {#items} to
+  # access the underlying array of objects.
+  #
+  # @example Load a collection of acts and then iterate over them.
+  #
+  #     acts = Slaw::DocumentCollection.new
+  #     acts.discover('/path/to/acts/')
+  #
+  #     for act in acts
+  #       puts act.short_title
+  #     end
+  #
   class DocumentCollection
     include Enumerable
     extend Forwardable
+    # [Array<Act>] The underlying array of acts
     attr_accessor :items
     def_delegators :items, :each, :<<, :length
@@ -15,16 +32,27 @@ module Slaw
       @items = items || []
     end
-    # Find all XML files in +path+ and return
-    # a list of instances of +cls+.
+    # Find all XML files in `path` and add them into this
+    # collection.
+    #
+    # @param path [String] the path to glob for xml files
+    # @param cls [Class] the class to instantiate for each file
+    #
+    # @return [DocumentCollection] this collection
     def discover(path, cls=Slaw::Act)
       for fname in Dir.glob("#{path}/**/*.xml")
         @items << cls.new(fname)
       end
+      self
     end
     # Try to find an act whose FRBRuri matches this one,
     # returning nil on failure
+    #
+    # @param uri [String] the uri to look for
+    #
+    # @return [Act, nil] the act, or nil
     def for_uri(uri)
       return @items.find { |doc| doc.id_uri == uri }
     end

data/lib/slaw/extract/extractor.rb ADDED Viewed

@@ -0,0 +1,93 @@
+require 'open3'
+module Slaw
+  module Extract
+    # Routines for extracting and cleaning up content from other formats, such as PDF.
+    #
+    # You may need to set the location of the `pdftotext` binary.
+    #
+    # On Mac OS X, use `brew install xpdf` or download from http://www.foolabs.com/xpdf/download.html
+    #
+    # On Heroku, you'll need to do some hoop jumping, see http://theprogrammingbutler.com/blog/archives/2011/07/28/running-pdftotext-on-heroku/
+    class Extractor
+      include Slaw::Logging
+      @@pdftotext_path = "pdftotext"
+      # Object with text cleaning helpers
+      attr_accessor :cleanser
+      def initialize
+        @cleanser = Slaw::Parse::Cleanser.new
+      end
+      # Extract text from a file and run cleanup on it.
+      #
+      # @param filename [String] filename to extract from
+      #
+      # @return [String] extracted text
+      def extract_from_file(filename)
+        ext = filename[-4..-1].downcase
+        case ext
+        when '.pdf'
+          extract_from_pdf(filename)
+        when '.txt'
+          extract_from_text(filename)
+        else
+          raise ArgumentError.new("Unsupported file type #{ext}")
+        end
+      end
+      # Extract text from a PDF
+      #
+      # @param filename [String] filename to extract from
+      #
+      # @return [String] extracted text
+      def extract_from_pdf(filename)
+        cmd = pdf_to_text_cmd(filename)
+        logger.info("Executing: #{cmd}")
+        stdout, status = Open3.capture2(*cmd)
+        if status == 0
+          cleanup(stdout)
+        else
+          nil
+        end
+      end
+      # Build a command for the external PDF-to-text utility.
+      #
+      # @param filename [String] the pdf file
+      #
+      # @return [Array<String>] command and params to execute
+      def pdf_to_text_cmd(filename)
+        [Extractor.pdftotext_path, "-enc", "UTF-8", filename, "-"]
+      end
+      def extract_from_text(filename)
+        cleanup(File.read(filename))
+      end
+      # Run general once-off cleanup of extracted text.
+      def cleanup(text)
+        text = @cleanser.cleanup(text)
+        text = @cleanser.remove_empty_lines(text)
+        text = @cleanser.reformat(text)
+        text
+      end
+      # Get location of the pdftotext executable for all instances.
+      def self.pdftotext_path
+        @@pdftotext_path
+      end
+      # Set location of the pdftotext executable for all instances.
+      def self.pdftotext_path=(val)
+        @@pdftotext_path = val
+      end
+    end
+  end
+end

data/lib/slaw/parse/blocklists.rb CHANGED Viewed

@@ -3,28 +3,36 @@ module Slaw
     module Blocklists
       include Slaw::Namespace
-      # Correctly re-nest nested block lists.
+      # Correctly re-nest nested block lists. We do this by identifying the
+      # numbering format of each item in the list and comparing it with the
+      # surrounding elements. When the numbering format changes, we start
+      # a new nested list.
       #
-      # (a)
-      # (b)
-      # (i)
-      # (ii)
-      # (aa)
-      # (bb)
-      # (c)
-      # (d)
+      # We make sure to handle special cases such as `(i)` coming between
+      # `(h)` and `(j)` versus being at the start of a `(i), (ii), (iii)`
+      # list.
       #
-      # becomes
-      #
-      # (a)
-      # (b)
-      #   (i)
-      #   (ii)
+      #     (a)
+      #     (b)
+      #     (i)
+      #     (ii)
       #     (aa)
       #     (bb)
-      # (c)
-      # (d)
+      #     (c)
+      #     (d)
+      #
+      # becomes
+      #
+      #     (a)
+      #     (b)
+      #       (i)
+      #       (ii)
+      #         (aa)
+      #         (bb)
+      #     (c)
+      #     (d)
       #
+      # @param doc [Nokogiri::XML::Document] the document
       def self.nest_blocklists(doc)
         doc.xpath('//a:blockList', a: NS).each do |blocklist|
           items = blocklist.xpath('a:item', a: NS)

data/lib/slaw/parse/builder.rb CHANGED Viewed

@@ -1,25 +1,67 @@
+# encoding: UTF-8
 require 'builder'
 require 'treetop'
 module Slaw
   module Parse
-    # Primary class for building Akoma Ntoso documents.
+    # The primary class for building Akoma Ntoso documents from plain text documents.
+    #
+    # The builder uses a grammar to break down a plain-text version of an act into a
+    # syntax tree. This tree can then be serialized into an Akoma Ntoso compatible
+    # XML document.
+    #
+    # @example Parse some text into a well-formed document
+    #     builder = Slaw::Parse::Builder.new
+    #     xml = builder.parse_text(text)
+    #     doc = builder.parse_xml(xml)
+    #     builder.postprocess(doc)
+    #
+    # @example A quicker way to build a well-formed document
+    #     builder = Slaw::Parse::Builder.new
+    #     doc = builder.parse_and_process_text(text)
     #
-    # It can convert from plain text a new Akoma Ntoso document, or
-    # update existing documents.
     class Builder
       include Slaw::Namespace
       include Slaw::Logging
       Treetop.load(File.dirname(__FILE__) + "/bylaw.treetop")
+      # [Hash] A Hash of options that are made available to the parser when parsing.
       attr_accessor :parse_options
-      def initialize()
-        @parse_options = {}
+      def initialize(parse_options={})
+        @parse_options = parse_options
+      end
+      # Do all the work necessary to parse text into a well-formed XML document.
+      #
+      # @param text [String] the text to parse
+      # @param root [Symbol] the root element of the grammar
+      #
+      # @return [Nokogiri::XML::Document] a well formed document
+      def parse_and_process_text(text, root=:bylaw)
+        postprocess(parse_xml(parse_text(text, root)))
+      end
+      # Parse text into XML. You should still run {#postprocess} on the
+      # resulting XML to normalise it.
+      #
+      # @param text [String] the text to parse
+      # @param root [Symbol] the root element of the grammar
+      #
+      # @return [String] an XML string
+      def parse_text(text, root=:bylaw)
+        tree = text_to_syntax_tree(text, root)
+        xml_from_syntax_tree(tree)
       end
-      # Try to parse plain text into a syntax tree
+      # Parse plain text into a syntax tree.
+      #
+      # @param text [String] the text to parse
+      # @param root [Symbol] the root element of the grammar
+      #
+      # @return [Object] the root of the resulting parse tree, usually a Treetop::Node object
       def text_to_syntax_tree(text, root=:bylaw)
         parser = Slaw::Parse::BylawParser.new
         parser.options = @parse_options
@@ -35,7 +77,12 @@ module Slaw
         tree
       end
-      # Generate an XML document from the given syntax tree.
+      # Generate an XML document from the given syntax tree. You should still
+      # run {#postprocess} on the resulting XML to normalise it.
+      #
+      # @param tree [Object] a Treetop::Node object
+      #
+      # @return [String] an XML string
       def xml_from_syntax_tree(tree)
         s = ""
         builder = ::Builder::XmlMarkup.new(indent: 2, target: s)
@@ -50,38 +97,41 @@ module Slaw
         s
       end
+      # Parse a string into a Nokogiri::XML::Document
+      #
+      # @param xml [String] string to parse
+      #
+      # @return [Nokogiri::XML::Document]
       def parse_xml(xml)
         Nokogiri::XML(xml, &:noblanks)
       end
+      # Serialise a Nokogiri::XML::Document into a string
+      #
+      # @param doc [Nokogiri::XML::Document] document
+      #
+      # @return [String] pretty printed string
       def to_xml(doc)
         doc.to_xml(indent: 2)
       end
-      # Run various postprocesses on the XML, and return
-      # the updated XML.
+      # Postprocess an XML document.
+      #
+      # @param doc [Nokogiri::XML::Document]
+      #
+      # @return [Nokogiri::XML::Document] the updated document
       def postprocess(doc)
         normalise_headings(doc)
         find_short_title(doc)
-        sanitise(doc)
-      end
-      # Do sanitisations, such as finding and linking definitions
-      def sanitise(doc)
         link_definitions(doc)
         nest_blocklists(doc)
-      end
-      # recalculate ids for <term> elements
-      def renumber_terms(doc)
-        logger.info("Renumbering terms")
-        doc.xpath('//a:term', a: NS).each_with_index do |term, i|
-          term['id'] = "trm#{i}"
-        end
+        doc
       end
       # Change CAPCASE headings into Sentence case.
+      #
+      # @param doc [Nokogiri::XML::Document]
       def normalise_headings(doc)
         logger.info("Normalising headings")
@@ -94,6 +144,8 @@ module Slaw
       end
       # Find the short title and add it as an FRBRalias element in the meta section
+      #
+      # @param doc [Nokogiri::XML::Document]
       def find_short_title(doc)
         logger.info("Finding short title")
@@ -117,6 +169,8 @@ module Slaw
       # Find definitions of terms and introduce them into the
       # meta section of the document.
+      #
+      # @param doc [Nokogiri::XML::Document]
       def link_definitions(doc)
         logger.info("Finding and linking definitions")
@@ -126,6 +180,12 @@ module Slaw
         renumber_terms(doc)
       end
+      # Find `def` elements in the document and return a Hash from
+      # term ids to the text of each term
+      #
+      # @param doc [Nokogiri::XML::Document]
+      #
+      # @return [Hash{String => String}]
       def find_definitions(doc)
         guess_at_definitions(doc)
@@ -239,6 +299,21 @@ module Slaw
         end
       end
+      # recalculate ids for <term> elements
+      def renumber_terms(doc)
+        logger.info("Renumbering terms")
+        doc.xpath('//a:term', a: NS).each_with_index do |term, i|
+          term['id'] = "trm#{i}"
+        end
+      end
+      # Correctly nest blocklists.
+      #
+      # The grammar gives us flat blocklists, we need to introspect the
+      # numbering of the lists to correctly nest them.
+      #
+      # @param doc [Nokogiri::XML::Document]
       def nest_blocklists(doc)
         logger.info("Nesting blocklists")

data/lib/slaw/parse/bylaw.treetop CHANGED Viewed

@@ -1,3 +1,5 @@
+# encoding: UTF-8
 require 'slaw/parse/grammar_helpers'
 module Slaw

data/lib/slaw/parse/cleanser.rb CHANGED Viewed

@@ -50,16 +50,17 @@ module Slaw
          .gsub("", '')
       end
+      # change weird quotes to normal ones
       def fix_quotes(s)
-        # change weird quotes to normal ones
         s.gsub(/‘‘|’’|''/, '"')
       end
+      # tabs to spaces
       def expand_tabs(s)
-        # tabs to spaces
         s.gsub(/\t/, ' ')
       end
+      # Try to remove boilerplate lines found in many files, such as page numbers.
       def remove_boilerplate(s)
         # nuke any line to do with Sabinet and the government printer
         s.gsub(/^.*Sabinet.*Government Printer.*$/i, '')\
@@ -72,6 +73,8 @@ module Slaw
          .gsub(/^\s*page \d+( of \d+)?\s*\n/i, '')
       end
+      # Get rid of whitespace at the end of lines and at the start and end of the
+      # entire string.
       def chomp(s)
         # trailing whitespace at end of lines
         s = s.gsub(/ +$/, '')
@@ -85,8 +88,11 @@ module Slaw
         s.end_with?("\n") ? s : (s + "\n")
       end
-      # make educated guesses about lines that should
-      # have been broken but haven't, and break them
+      # Make educated guesses about lines that should
+      # have been broken but haven't, and break them.
+      #
+      # This is very dependent on a locale's legislation grammar, there are
+      # lots of rules of thumb that make this work.
       def break_lines(s)
         # often we find a section title munged onto the same line as its first statement
         # eg:
@@ -115,8 +121,8 @@ module Slaw
         s
       end
-      # finds likely candidates for unnecessarily broken lines
-      # and  them
+      # Find likely candidates for unnecessarily broken lines
+      # and unbreak them.
       def unbreak_lines(s)
         lines = s.split(/\n/)
         output = []
@@ -141,8 +147,8 @@ module Slaw
         output.join("\n")
       end
-      # do our best to remove table of contents at the start,
-      # it really confuses the grammer
+      # Do our best to remove table of contents at the start,
+      # it really confuses the grammar.
       def strip_toc(s)
         # first, try to find 'TABLE OF CONTENTS' anywhere within the first 4K of text,
         if toc_start = s[0..4096].match(/TABLE OF CONTENTS/i)

data/lib/slaw/parse/grammar_helpers.rb CHANGED Viewed

@@ -1,5 +1,9 @@
 module Slaw
   module Parse
+    # These helpers are mixed into the treetop grammar and provide a means for
+    # exposing options into the grammar.
+    #
+    # @see Builder#parse_options
     module GrammarHelpers
       attr_writer :options

data/lib/slaw/render/html.rb CHANGED Viewed

@@ -2,7 +2,14 @@ module Slaw
   module Render
     # Support for transforming XML AN documents into HTML.
+    #
+    # This rendering is done using XSLT stylesheets. Both an entire
+    # document and fragments can be rendered.
     class HTMLRenderer
+      # [Hash] A Hash of Nokogiri::XSLT objects
+      attr_accessor :xslt
       def initialize
         here = File.dirname(__FILE__)
@@ -12,12 +19,17 @@ module Slaw
         }
       end
-      # Transform an entire XML document +doc+ (a Nokogiri::XML::Document object) into HTML.
-      # Specify +base_url+ to manage the base for relative URLs generated by
+      # Transform an entire XML document (a Nokogiri::XML::Document object) into HTML.
+      # Specify `base_url` to manage the base for relative URLs generated by
       # the transform.
+      #
+      # @param doc [Nokogiri::XML::Document] document to render
+      # @param base_url [String] root URL for relative URLs (cannot be empty)
+      #
+      # @return [String]
       def render(doc, base_url='')
-        params = transform_params({'base_url' => base_url})
-        run_xslt(:act, doc, params)
+        params = _transform_params({'base_url' => base_url})
+        _run_xslt(:act, doc, params)
       end
       # Transform just a single node and its children into HTML.
@@ -25,8 +37,13 @@ module Slaw
       # If +elem+ has an id, we use xpath to tell the XSLT which
       # element to transform. Otherwise we copy the node into a new
       # tree and apply the XSLT to that.
+      #
+      # @param node [Nokogiri::XML::Node] node to render
+      # @param base_url [String] root URL for relative URLs (cannot be empty)
+      #
+      # @return [String]
       def render_node(node, base_url='')
-        params = transform_params({'base_url' => base_url})
+        params = _transform_params({'base_url' => base_url})
         if node.id
           params += ['root_elem', "//*[@id='#{node.id}']"]
@@ -38,14 +55,14 @@ module Slaw
           params += ['root_elem', '*']
         end
-        run_xslt(:fragment, doc, params)
+        _run_xslt(:fragment, doc, params)
       end
-      def run_xslt(xslt, doc, params)
+      def _run_xslt(xslt, doc, params)
         @xslt[xslt].transform(doc, params).to_s
       end
-      def transform_params(params)
+      def _transform_params(params)
         Nokogiri::XSLT.quote_params(params)
       end
     end

data/lib/slaw/render/xsl/elements.xsl CHANGED Viewed

@@ -77,7 +77,7 @@
         <xsl:value-of select="@refersTo" />
       </xsl:attribute>
-      <xsl:attribute name="href"><xsl:value-of select="$base_url" />definitions/#def-<xsl:value-of select="translate(@refersTo, '#', '')" /></xsl:attribute>
+      <xsl:attribute name="href"><xsl:value-of select="$base_url" />/definitions/#def-<xsl:value-of select="translate(@refersTo, '#', '')" /></xsl:attribute>
       <xsl:apply-templates />
     </a>

data/lib/slaw/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Slaw
-  VERSION = "0.1.2"
+  VERSION = "0.1.3"
 end

data/slaw.gemspec CHANGED Viewed

@@ -23,7 +23,6 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "rspec", "~> 2.14.1"
   spec.add_runtime_dependency "nokogiri", "~> 1.6.0"
-  spec.add_runtime_dependency "elasticsearch", "~> 1.0.5"
   spec.add_runtime_dependency "treetop", "~> 1.5"
   spec.add_runtime_dependency "builder", "~> 3.2.2"
   spec.add_runtime_dependency "log4r", "~> 1.1.10"

data/spec/extract/extractor_spec.rb ADDED Viewed

@@ -0,0 +1,14 @@
+require 'tempfile'
+require 'spec_helper'
+require 'slaw'
+describe Slaw::Extract::Extractor do
+  it 'should extract from plain text' do
+    f = Tempfile.new(['test', '.txt'])
+    f.write('This is some text')
+    f.rewind
+    subject.extract_from_file(f.path).should == "This is some text\n"
+  end
+end

data/spec/parse/builder_spec.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# encoding: UTF-8
 require 'spec_helper'
 require 'slaw'

data/spec/parse/bylaw_spec.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# encoding: UTF-8
 require 'slaw'
 require 'builder'

data/spec/parse/cleanser_spec.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# encoding: UTF-8
 require 'spec_helper'
 require 'slaw'

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: slaw
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.3
 platform: ruby
 authors:
 - Greg Kempe
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-09-17 00:00:00.000000000 Z
+date: 2014-09-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -66,20 +66,6 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: 1.6.0
-- !ruby/object:Gem::Dependency
-  name: elasticsearch
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: 1.0.5
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: 1.0.5
 - !ruby/object:Gem::Dependency
   name: treetop
   requirement: !ruby/object:Gem::Requirement
@@ -131,6 +117,7 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".gitignore"
+- ".travis.yml"
 - Gemfile
 - LICENSE.txt
 - README.md
@@ -139,7 +126,7 @@ files:
 - lib/slaw/act.rb
 - lib/slaw/bylaw.rb
 - lib/slaw/collection.rb
-- lib/slaw/elasticsearch.rb
+- lib/slaw/extract/extractor.rb
 - lib/slaw/lifecycle_event.rb
 - lib/slaw/logging.rb
 - lib/slaw/namespace.rb
@@ -157,6 +144,7 @@ files:
 - lib/slaw/version.rb
 - lib/slaw/xml_support.rb
 - slaw.gemspec
+- spec/extract/extractor_spec.rb
 - spec/parse/builder_spec.rb
 - spec/parse/bylaw_spec.rb
 - spec/parse/cleanser_spec.rb
@@ -187,6 +175,7 @@ signing_key:
 specification_version: 4
 summary: A lightweight library for using Akoma Ntoso acts in Ruby.
 test_files:
+- spec/extract/extractor_spec.rb
 - spec/parse/builder_spec.rb
 - spec/parse/bylaw_spec.rb
 - spec/parse/cleanser_spec.rb

data/lib/slaw/elasticsearch.rb DELETED Viewed

@@ -1,107 +0,0 @@
-require 'elasticsearch'
-require 'log4r'
-module Slaw
-  # Support for indexing and search using elasticsearch
-  class ElasticSearchSupport
-    attr_accessor :es, :mapping, :index, :type, :base_url
-    def initialize(index, type, base_url, client_params={}, es=nil)
-      @es = es || create_client(client_params)
-      @ix = index
-      @type = type
-      @base_url = base_url
-      @mapping = {
-        frbr_uri: {type: 'string', index: 'not_analyzed'},
-        url: {type: 'string', index: 'not_analyzed'},
-        title: {type: 'string', analyzer: 'english'},
-        content: {type: 'string', analyzer: 'english'},
-        published_on: {type: 'date', format: 'dateOptionalTime'},
-        region: {type: 'string', index: 'not_analyzed'},
-        region_name: {type: 'string', index: 'not_analyzed'},
-        repealed: {type: 'boolean'},
-      }
-      @log = Log4r::Logger['Slaw']
-    end
-    def create_client(client_params)
-      Elasticsearch::Client.new(client_params)
-    end
-    def reindex!(docs, &block)
-      define_mapping!
-      index_documents!(docs, &block)
-    end
-    def index_documents!(docs, &block)
-      for doc in docs
-        id = doc.id_uri.gsub('/', '-')
-        data = {
-          frbr_uri: doc.id_uri,
-          url: @base_url + doc.id_uri,
-          title: doc.short_title,
-          content: doc.body.text,
-          region: doc.region,
-          published_on: doc.publication['date'],
-          repealed: doc.repealed?,
-        }
-        yield doc, data if block_given?
-        @log.info("Indexing #{id}")
-        @es.index(index: @ix, type: @type, id: id, body: data)
-      end
-    end
-    def define_mapping!
-      @log.info("Deleting index")
-      @es.indices.create(index: @ix) unless @es.indices.exists(index: @ix)
-      # delete existing mapping
-      unless @es.indices.get_mapping(index: @ix, type: @type).empty?
-        @es.indices.delete_mapping(index: @ix, type: @type)
-      end
-      @log.info("Defining mappings")
-      @es.indices.put_mapping(index: @ix, type: @type, body: {
-        @type => {properties: @mapping}
-      })
-    end
-    def search(q, from=0, size=10)
-      @es.search(index: @ix, body: {
-        query: {
-          multi_match: {
-            query: q,
-            type: 'cross_fields',
-            fields: ['title', 'content'],
-          }
-        },
-        fields: ['frbr_uri', 'repealed', 'published_on', 'title', 'url', 'region_name'],
-        highlight: {
-          order: "score",
-          fields: {
-            content: {
-              fragment_size: 80,
-              number_of_fragments: 2,
-            },
-            title: {
-              number_of_fragments: 0, # entire field
-            }
-          },
-          pre_tags: ['<mark>'],
-          post_tags: ['</mark>'],
-        },
-        from: from,
-        size: size,
-        sort: {
-          '_score' => {order: 'desc'}
-        }
-      })
-    end
-  end
-end