RubyGems - slaw - Versions diffs - 0.17.2 → 1.0.0.alpha.1 - Mend

slaw 0.17.2 → 1.0.0.alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

checksums.yaml +4 -4
data/README.md +9 -2
data/bin/slaw +2 -19
data/lib/slaw/generator.rb +21 -6
data/lib/slaw/grammars/core_nodes.rb +17 -0
data/lib/slaw/grammars/inlines.treetop +45 -0
data/lib/slaw/grammars/inlines_nodes.rb +58 -0
data/lib/slaw/grammars/pl/act.treetop +246 -0
data/lib/slaw/grammars/pl/act_nodes.rb +469 -0
data/lib/slaw/grammars/schedules.treetop +33 -0
data/lib/slaw/grammars/schedules_nodes.rb +107 -0
data/lib/slaw/grammars/tables.treetop +59 -0
data/lib/slaw/grammars/tables_nodes.rb +74 -0
data/lib/slaw/grammars/terminals.treetop +84 -0
data/lib/slaw/grammars/za/act.treetop +222 -0
data/lib/slaw/grammars/za/act_nodes.rb +307 -0
data/lib/slaw/{za → grammars/za}/act_text.xsl +0 -0
data/lib/slaw/parse/builder.rb +6 -202
data/lib/slaw/version.rb +1 -1
data/spec/generator_spec.rb +2 -0
data/spec/parse/builder_spec.rb +0 -48
data/spec/pl/act_block_spec.rb +449 -0
data/spec/za/act_block_spec.rb +5 -3
data/spec/za/act_inline_spec.rb +2 -0
data/spec/za/act_schedules_spec.rb +2 -0
data/spec/za/act_table_spec.rb +2 -0
metadata +19 -7
data/lib/slaw/za/act.treetop +0 -393
data/lib/slaw/za/act_nodes.rb +0 -532

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 60e83b6293761721e7a2213e9e5c668accc5873c
-  data.tar.gz: b8e31cc3f17512a32af5426a86f87c68c14a0b45
+  metadata.gz: e5c6f9929b92711f0a7c608387b49784c8c0198c
+  data.tar.gz: 7557e00919085931eae2ce9195bbe55342a50c9c
 SHA512:
-  metadata.gz: efad800e07b95ae255fe44ccf118f5dc43793b88394d5eec87b59ddc30ef0222aabbd2e513c7a5dd3c6b34a271e77dccb599f10e2adf8c2faa0f0a83c67f65bf
-  data.tar.gz: c5c44d83a2c736f62122a96301435517999ba008768f3d54642b828f9897163172da7202691bd32a11a5427613f90a1ae0e5863ae375f6a3bcdf0514616b4fad
+  metadata.gz: 386d5e7195f18838c00af784a46792f349913ac5fe07f7799e4e3055fd6720da3e1dcb6b2e10b93d14e2519758eecdd4b46fe31ceecf2bf10b707cf51f6e93dd
+  data.tar.gz: 73aa8a5060a8933a09bec8ebac589aeb835de216a76f7865f9bf5883eb89ab347117c648eda8e806eaa668d4f9c1ce7eefeb4bd9dd678abbcb1a200986d98063

data/README.md CHANGED

@@ -8,7 +8,7 @@ Slaw allows you to:
 1. extract plain text from PDFs and clean up that text
 2. parse plain text and transform it into an Akoma Ntoso Act XML document
-3. render the XML document into HTML
+3. unparse Akoma Ntoso XML into text that can be parsed back into Akoma Ntoso.
 Slaw is lightweight because it wraps around a Nokogiri XML representation of
 the parsed document. It provides some support methods for manipulating these
@@ -61,7 +61,7 @@ formats.
 The grammar cannot catch some subtleties of an act or by-law -- such as nested list numbering --
 so Slaw performs some post-processing on the XML produced by the parser. In particular,
-it nests lists correctly and looks for specially defined terms and their occurrences in the document.
+it nests lists correctly.
 ## Quick Start
@@ -218,6 +218,13 @@ Akoma Ntoso `component` elements at the end of the XML document, with a name of
 ## Changelog
+### 1.0.0
+* Improved support for other legal traditions / grammars.
+* Add Polish legal tradition grammar.
+* Slaw no longer does too much introspection of a parsed document, since that can be so tradition-dependent.
+* Remove definition linking; Slaw no longer supports it.
 ### 0.17.2
 * Match defined terms in 'definition' section.

data/bin/slaw CHANGED

@@ -17,19 +17,15 @@ class SlawCLI < Thor
   desc "parse FILE", "Parse FILE into Akoma Ntoso XML"
   option :input, enum: ['text', 'pdf'], desc: "Type of input if it can't be determined automatically"
   option :pdftotext, desc: "Location of the pdftotext binary if not in PATH"
-  option :definitions, type: :boolean, desc: "Find and link definitions (this can be slow). Default: false"
   option :fragment, type: :string, desc: "Akoma Ntoso element name that the imported text represents. Support depends on the grammar."
   option :id_prefix, type: :string, desc: "Prefix to be used when generating ID elements when parsing a fragment."
   option :section_number_position, enum: ['before-title', 'after-title', 'guess'], desc: "Where do section titles come in relation to the section number? Default: before-title"
   option :reformat, type: :boolean, desc: "Reformat common formatting issues to make grammar matching better. Default: true for PDF files, false otherwise"
   option :crop, type: :string, desc: "Crop box for PDF files, as 'left,top,width,height'."
+  option :grammar, type: :string, desc: "Grammar name (usually a two-letter country code). Default is za."
   def parse(name)
     logging
-    if options[:fragment] and options[:definitions]
-      raise Thor::Error.new("--definitions can't be used together with --fragment")
-    end
     Slaw::Extract::Extractor.pdftotext_path = options[:pdftotext] if options[:pdftotext]
     extractor = Slaw::Extract::Extractor.new
@@ -50,7 +46,7 @@ class SlawCLI < Thor
       text = extractor.extract_from_file(name)
     end
-    generator = Slaw::ActGenerator.new
+    generator = Slaw::ActGenerator.new(options[:grammar] || 'za')
     text = generator.reformat(text) if options[:reformat]
@@ -94,22 +90,9 @@ class SlawCLI < Thor
       exit 1
     end
-    # definitions?
-    generator.builder.link_definitions(act.doc) if options[:definitions]
     puts act.to_xml(indent: 2)
   end
-  desc "link-definitions FILE", "Find and link defined terms in FILE"
-  def link_definitions(name)
-    builder = Slaw::ActGenerator.new.builder
-    doc = File.open(name, 'r') { |f| doc = builder.parse_xml(f.read) }
-    builder.link_definitions(doc)
-    puts builder.to_xml(doc)
-  end
   desc "unparse FILE", "Unparse FILE from Akoma Ntoso XML back into text suitable for re-parsing"
   def unparse(name)
     generator = Slaw::ActGenerator.new

data/lib/slaw/generator.rb CHANGED

@@ -1,8 +1,6 @@
 module Slaw
   # Base class for generating Act documents
   class ActGenerator
-    Treetop.load(File.dirname(__FILE__) + "/za/act.treetop")
     # [Treetop::Runtime::CompiledParser] compiled parser
     attr_accessor :parser
@@ -12,13 +10,31 @@ module Slaw
     # The type that will hold the generated document
     attr_accessor :document_class
-    def initialize
-      @parser = Slaw::ZA::ActParser.new
+    @@parsers = {}
+    # Create a generator bound to one grammar.
+    #
+    # @param grammar [String] name of the grammar directory under
+    #   grammars/ (e.g. 'za'). Defaults to 'za' so that callers which
+    #   construct a generator without arguments keep working.
+    def initialize(grammar = 'za')
+      @grammar = grammar
+      @parser = build_parser
       @builder = Slaw::Parse::Builder.new(parser: @parser)
+      @parser = @builder.parser
       @cleanser = Slaw::Parse::Cleanser.new
       @document_class = Slaw::Act
     end
+    def build_parser
+      unless @@parsers[@grammar]
+        # load the grammar
+        grammar_file = File.dirname(__FILE__) + "/grammars/#{@grammar}/act.treetop"
+        Treetop.load(grammar_file)
+        grammar_class = "Slaw::Grammars::#{@grammar.upcase}::ActParser"
+        @@parsers[@grammar] = eval(grammar_class)
+      end
+      @parser = @@parsers[@grammar].new
+    end
     # Generate a Slaw::Act instance from plain text.
     #
     # @param text [String] plain text
@@ -66,8 +82,7 @@ module Slaw
     # Transform an Akoma Ntoso XML document back into a plain-text version
     # suitable for re-parsing back into XML with no loss of structure.
     def text_from_act(doc)
-      here = File.dirname(__FILE__)
-      xslt = Nokogiri::XSLT(File.read(File.join([here, 'za/act_text.xsl'])))
+      xslt = Nokogiri::XSLT(File.read(File.join([File.dirname(__FILE__), "grammars/#{@grammar}/act_text.xsl"])))
       xslt.transform(doc).child.to_xml
     end
   end

data/lib/slaw/grammars/core_nodes.rb ADDED

@@ -0,0 +1,17 @@
+module Slaw
+  module Grammars
+    class GroupNode < Treetop::Runtime::SyntaxNode
+      def to_xml(b, *args)
+        children.elements.each { |e| e.to_xml(b, *args) }
+      end
+    end
+    class Body < Treetop::Runtime::SyntaxNode
+      def to_xml(b)
+        b.body { |b|
+          children.elements.each_with_index { |e, i| e.to_xml(b, '', i) }
+        }
+      end
+    end
+  end
+end

data/lib/slaw/grammars/inlines.treetop ADDED

@@ -0,0 +1,45 @@
+# encoding: UTF-8
+require 'slaw/grammars/terminals'
+require 'slaw/grammars/inlines_nodes'
+module Slaw
+  module Grammars
+    grammar Inlines
+      ##########
+      # inline content
+      rule inline_statement
+        space? '\\'? clauses eol
+        <NakedStatement>
+      end
+      # one or more words, allowing inline elements
+      rule clauses
+        (remark / image / ref / [^\n])+
+        <Clauses>
+      end
+      rule remark
+        '[[' content:(ref / (!']]' .))+ ']]'
+        <Remark>
+      end
+      rule image
+        # images like markdown
+        # eg. ![title text](image url)
+        #
+        # the title text is optional, but the enclosing square brackets aren't
+        '![' content:(!'](' [^\n])* '](' href:([^)\n]+) ')'
+        <Image>
+      end
+      rule ref
+        # links like markdown
+        # eg. [link text](link url)
+        '[' content:(!'](' [^\n])+ '](' href:([^)\n]+) ')'
+        <Ref>
+      end
+    end
+  end
+end

data/lib/slaw/grammars/inlines_nodes.rb ADDED

@@ -0,0 +1,58 @@
+module Slaw
+  module Grammars
+    module Inlines
+      class NakedStatement < Treetop::Runtime::SyntaxNode
+        def to_xml(b, idprefix, i=0)
+          b.p { |b| clauses.to_xml(b, idprefix) } if clauses
+        end
+        def content
+          clauses
+        end
+      end
+      class Clauses < Treetop::Runtime::SyntaxNode
+        def to_xml(b, idprefix=nil)
+          for e in elements
+            if e.respond_to? :to_xml
+              e.to_xml(b, idprefix)
+            else
+              b << e.text_value
+            end
+          end
+        end
+      end
+      class Remark < Treetop::Runtime::SyntaxNode
+        def to_xml(b, idprefix)
+          b.remark(status: 'editorial') do |b|
+            b << '['
+            for e in content.elements
+              if e.respond_to? :to_xml
+                e.to_xml(b, idprefix)
+              else
+                b << e.text_value
+              end
+            end
+            b << ']'
+          end
+        end
+      end
+      class Image < Treetop::Runtime::SyntaxNode
+        def to_xml(b, idprefix)
+          attrs = {src: href.text_value}
+          attrs[:alt] = content.text_value unless content.text_value.empty?
+          b.img(attrs)
+        end
+      end
+      class Ref < Treetop::Runtime::SyntaxNode
+        def to_xml(b, idprefix)
+          b.ref(content.text_value, href: href.text_value)
+        end
+      end
+    end
+  end
+end

data/lib/slaw/grammars/pl/act.treetop ADDED

@@ -0,0 +1,246 @@
+# encoding: UTF-8
+require 'slaw/parse/grammar_helpers'
+require 'slaw/grammars/pl/act_nodes'
+require 'slaw/grammars/terminals'
+require 'slaw/grammars/tables'
+require 'slaw/grammars/schedules'
+require 'slaw/grammars/inlines'
+module Slaw
+  module Grammars
+    module PL
+      grammar Act
+        include Slaw::Parse::GrammarHelpers
+        ########
+        # major containers
+        rule act
+          empty_line*
+          preface:preface?
+          preamble:preamble?
+          body
+          schedules:schedules_container? <Act>
+        end
+        rule preface
+          # The PREAMBLE guards are case-insensitive to agree with the
+          # `preamble` rule, which matches its marker with 'PREAMBLE'i.
+          # Otherwise a marker such as "Preamble" would be consumed here as
+          # an ordinary preface statement and the preamble would never match.
+          !'PREAMBLE'i
+          ('PREFACE'i space? eol)?
+          statements:(!'PREAMBLE'i pre_body_statement)* <Preface>
+        end
+        rule preamble
+          'PREAMBLE'i space? eol
+          statements:pre_body_statement* <Preamble>
+        end
+        rule body
+          children:(division / subdivision / chapter / article / section / paragraph / point / litera / block_paragraphs)+ <Body>
+        end
+        rule division
+          heading:division_heading
+          children:(subdivision / chapter / article / section / paragraph / point / litera / block_paragraphs)*
+          <Division>
+        end
+        rule subdivision
+          heading:subdivision_heading
+          children:(chapter / article / section / paragraph / point / litera / block_paragraphs)*
+          <Subdivision>
+        end
+        rule chapter
+          heading:chapter_heading
+          children:(article / section / paragraph / point / litera / block_paragraphs)*
+          <Chapter>
+        end
+        rule article
+          # Art. 55. 1. something
+          article_prefix whitespace
+          intro:block_element? eol?
+          children:(section / paragraph / point / litera / block_paragraphs)* <Article>
+        end
+        rule section
+          # § 55. foo
+          section_prefix whitespace
+          intro:block_element? eol?
+          children:(paragraph / point / litera / block_paragraphs)* <Section>
+        end
+        rule paragraph
+          # ustęp:
+          #  34. ...
+          paragraph_prefix space?
+          intro:block_element? eol?
+          children:(point / litera / block_paragraphs)* <Paragraph>
+        end
+        rule point
+          # 12) aoeuaoeu
+          # 12a) aoeuaoeu
+          point_prefix whitespace
+          intro:block_element? eol?
+          children:(litera / block_paragraphs)* <Point>
+        end
+        rule litera
+          # a) aoeuaoeu
+          litera_prefix whitespace
+          intro:block_element? eol?
+          children:block_paragraphs* <Litera>
+        end
+        ##########
+        # group elements
+        #
+        # these are used externally and provide support when parsing just
+        # a particular portion of a document
+        rule divisions
+          children:division+ <GroupNode>
+        end
+        rule subdivisions
+          children:subdivision+ <GroupNode>
+        end
+        rule chapters
+          children:chapter+ <GroupNode>
+        end
+        rule articles
+          children:article+ <GroupNode>
+        end
+        rule sections
+          children:section+ <GroupNode>
+        end
+        rule paragraphs
+          children:paragraph+ <GroupNode>
+        end
+        rule points
+          children:point+ <GroupNode>
+        end
+        ##########
+        # headings
+        rule division_heading
+          space? prefix:division_heading_prefix heading:(newline? content)? eol
+          <GenericHeading>
+        end
+        rule subdivision_heading
+          space? prefix:subdivision_heading_prefix heading:(newline? content)? eol
+          <GenericHeading>
+        end
+        rule chapter_heading
+          space? prefix:chapter_heading_prefix heading:(newline? content)? eol
+          <GenericHeading>
+        end
+        ##########
+        # blocks of content inside containers
+        rule block_paragraphs
+          block_element+ <BlockParagraph>
+        end
+        rule block_element
+          # XXX: blocklist
+          (table / naked_statement)
+        end
+        # Block elements that don't have to appear at the start of a line.
+        # ie. we don't need to guard against the start of a chapter, section, etc.
+        rule inline_block_element
+          # XXX: blocklist
+          (table / inline_statement)
+        end
+        rule blocklist
+          blocklist_item+ <Blocklist>
+        end
+        rule blocklist_item
+          # TODO: this whitespace should probably be space, to allow empty blocklist items followed by plain text
+          space? blocklist_item_prefix whitespace item_content:(!blocklist_item_prefix clauses:clauses? eol)? eol?
+          <BlocklistItem>
+        end
+        rule blocklist_item_prefix
+          ('(' letter_ordinal ')') / dotted_number_3
+        end
+        ##########
+        # statements - single lines of content
+        #
+        # If a statement starts with a backslash, it's considered to have escaped the subsequent word,
+        # and is ignored. This allows escaping of section headings, etc.
+        rule naked_statement
+          space? !(division_heading / subdivision_heading / chapter_heading / article_prefix / section_prefix / schedule_title / paragraph_prefix / point_prefix / litera_prefix) '\\'? clauses eol
+          <NakedStatement>
+        end
+        rule pre_body_statement
+          space? !(division_heading / subdivision_heading / chapter_heading / article_prefix / section_prefix / schedule_title) '\\'? clauses eol
+          <NakedStatement>
+        end
+        ##########
+        # prefixes
+        rule division_heading_prefix
+          'dzia'i ('ł'/'Ł') space alphanums [ :-]*
+        end
+        rule subdivision_heading_prefix
+          'oddzia'i ('ł'/'Ł') space alphanums [ :.-]*
+        end
+        rule chapter_heading_prefix
+          'rozdzia'i ('ł'/'Ł') space alphanums [ :.-]*
+        end
+        rule article_prefix
+          # Art. 55.
+          # Artykuł 55.
+          #
+          # The ł/Ł choice needs its own parentheses: a sequence binds tighter
+          # than '/', so without them the alternatives would be
+          # ('Artyku' 'ł') or a bare 'Ł'.
+          ('Art.'i / ('Artyku'i ('ł'/'Ł'))) space number_letter '.'?
+        end
+        rule section_prefix
+          '§' space alphanums '.'?
+        end
+        rule paragraph_prefix
+          number_letter '.'
+        end
+        rule point_prefix
+          # 1) foo
+          # 2A) foo
+          number_letter ')'
+        end
+        rule litera_prefix
+          # a) foo
+          # bb) foo
+          letters:letter+ ')'
+        end
+        include Slaw::Grammars::Inlines
+        include Slaw::Grammars::Tables
+        include Slaw::Grammars::Schedules
+        include Slaw::Grammars::Terminals
+      end
+    end
+  end
+end