RubyGems - llt-segmenter - Versions diffs - 0.0.1 - Mend

llt-segmenter 0.0.1

Files changed (19) hide show

checksums.yaml +7 -0
data/.gitignore +17 -0
data/.rspec +3 -0
data/.travis.yml +6 -0
data/Gemfile +15 -0
data/LICENSE +20 -0
data/README.md +45 -0
data/Rakefile +7 -0
data/config/warble.rb +162 -0
data/config.ru +2 -0
data/lib/llt/segmenter/api.rb +20 -0
data/lib/llt/segmenter/version.rb +5 -0
data/lib/llt/segmenter.rb +97 -0
data/lib/llt/sentence.rb +10 -0
data/llt-segmenter.gemspec +29 -0
data/spec/lib/llt/segmenter/api_spec.rb +56 -0
data/spec/lib/llt/segmenter_spec.rb +259 -0
data/spec/spec_helper.rb +26 -0
metadata +134 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 4759c4666499718fd9364ff782a16f5d7ef818d1
+  data.tar.gz: cb27484995b057c79017ac11740dc68284f0085b
+SHA512:
+  metadata.gz: abb24d73dc6029e91bb5ba14d81fcd0244ec7cc1619d7848bddff4df785c38dcd1c069877e368930289a248e3e79a905a3d4771e7b7fa6b7087c33427c04d54d
+  data.tar.gz: dd8daa449bb083a62ebd6656c6c6221e2af9c36cb0d22fdd95775b8e781f312125857380abc43a854c6e282cda797f77331afe65f0fd5b2f04d6d5e4d417b389

data/.gitignore ADDED Viewed

@@ -0,0 +1,17 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp

data/.rspec ADDED Viewed

@@ -0,0 +1,3 @@
+--format documentation
+--color
+--tty

data/.travis.yml ADDED Viewed

@@ -0,0 +1,6 @@
+language: ruby
+before_script:
+  - "export JRUBY_OPTS=--2.0"
+rvm:
+  - 2.0.0
+  - jruby-20mode

data/Gemfile ADDED Viewed

@@ -0,0 +1,15 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in llt-segmenter.gemspec
+gemspec
+gem 'pry'
+gem 'coveralls', require: false
+gem 'llt-core', git: 'https://github.com/latin-language-toolkit/llt-core.git'
+gem 'llt-constants', git: 'https://github.com/latin-language-toolkit/llt-constants.git'
+gem 'llt-logger', git: 'https://github.com/latin-language-toolkit/llt-logger.git'
+platform :jruby do
+  gem 'jruby-httpclient'
+end

data/LICENSE ADDED Viewed

@@ -0,0 +1,20 @@
+The MIT License (MIT)
+Copyright (c) 2013 latin-language-toolkit
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,45 @@
+# LLT::Segmenter
+[![Version](http://allthebadges.io/latin-language-toolkit/llt-segmenter/badge_fury.png)](http://allthebadges.io/latin-language-toolkit/llt-segmenter/badge_fury)
+[![Build Status](https://travis-ci.org/latin-language-toolkit/llt-segmenter.png?branch=master)](https://travis-ci.org/latin-language-toolkit/llt-segmenter)
+[![Dependencies](http://allthebadges.io/latin-language-toolkit/llt-segmenter/gemnasium.png)](http://allthebadges.io/latin-language-toolkit/llt-segmenter/gemnasium)
+[![Coverage](https://coveralls.io/repos/latin-language-toolkit/llt-segmenter/badge.png?branch=master)](https://coveralls.io/r/latin-language-toolkit/llt-segmenter?branch=master)
+[![Code Climate](https://codeclimate.com/github/latin-language-toolkit/llt-segmenter.png)](https://codeclimate.com/github/latin-language-toolkit/llt-segmenter)
+Segments text into sentences.
+## Installation
+Add this line to your application's Gemfile:
+    gem 'llt-segmenter'
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install llt-segmenter
+## Usage
+TODO: Write usage instructions here
+## API
+This is currently a list of requirements; it will evolve into the API documentation.
+Input:
+- Text (or a URI)
+- Black-/Whitelist for separators.
+Output:
+- XML (TEI) or JSON
+## Contributing
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request

data/Rakefile ADDED Viewed

@@ -0,0 +1,7 @@
+require "bundler/gem_tasks"
+require "rspec/core/rake_task"
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec

data/config/warble.rb ADDED Viewed

@@ -0,0 +1,162 @@
+# Disable Rake-environment-task framework detection by uncommenting/setting to false
+# Warbler.framework_detection = false
+# Warbler web application assembly configuration file
+Warbler::Config.new do |config|
+  # Features: additional options controlling how the jar is built.
+  # Currently the following features are supported:
+  # - gemjar: package the gem repository in a jar file in WEB-INF/lib
+  # - executable: embed a web server and make the war executable
+  # - compiled: compile .rb files to .class files
+  # config.features = %w(gemjar)
+  # Application directories to be included in the webapp.
+  # config.dirs = %w(app config db lib log script vendor tmp)
+  # Additional files/directories to include, above those in config.dirs
+  # config.includes = FileList["db"]
+  # Additional files/directories to exclude
+  # config.excludes = FileList["lib/tasks/*"]
+  # Additional Java .jar files to include.  Note that if .jar files are placed
+  # in lib (and not otherwise excluded) then they need not be mentioned here.
+  # JRuby and JRuby-Rack are pre-loaded in this list.  Be sure to include your
+  # own versions if you directly set the value
+  # config.java_libs += FileList["lib/java/*.jar"]
+  # Loose Java classes and miscellaneous files to be included.
+  # config.java_classes = FileList["target/classes/**.*"]
+  # One or more pathmaps defining how the java classes should be copied into
+  # the archive. The example pathmap below accompanies the java_classes
+  # configuration above. See http://rake.rubyforge.org/classes/String.html#M000017
+  # for details of how to specify a pathmap.
+  # config.pathmaps.java_classes << "%{target/classes/,}p"
+  # Bundler support is built-in. If Warbler finds a Gemfile in the
+  # project directory, it will be used to collect the gems to bundle
+  # in your application. If you wish to explicitly disable this
+  # functionality, uncomment here.
+  # config.bundler = false
+  # An array of Bundler groups to avoid including in the war file.
+  # Defaults to ["development", "test", "assets"].
+  # config.bundle_without = []
+  # Other gems to be included. If you don't use Bundler or a gemspec
+  # file, you need to tell Warbler which gems your application needs
+  # so that they can be packaged in the archive.
+  # For Rails applications, the Rails gems are included by default
+  # unless the vendor/rails directory is present.
+  # config.gems += ["activerecord-jdbcmysql-adapter", "jruby-openssl"]
+  # config.gems << "tzinfo"
+  # Uncomment this if you don't want to package rails gem.
+  # config.gems -= ["rails"]
+  # The most recent versions of gems are used.
+  # You can specify versions of gems by using a hash assignment:
+  # config.gems["rails"] = "2.3.10"
+  # You can also use regexps or Gem::Dependency objects for flexibility or
+  # finer-grained control.
+  # config.gems << /^merb-/
+  # config.gems << Gem::Dependency.new("merb-core", "= 0.9.3")
+  # Include gem dependencies not mentioned specifically. Default is
+  # true, uncomment to turn off.
+  # config.gem_dependencies = false
+  # Array of regular expressions matching relative paths in gems to be
+  # excluded from the war. Defaults to empty, but you can set it like
+  # below, which excludes test files.
+  # config.gem_excludes = [/^(test|spec)\//]
+  # Pathmaps for controlling how application files are copied into the archive
+  # config.pathmaps.application = ["WEB-INF/%p"]
+  # Name of the archive (without the extension). Defaults to the basename
+  # of the project directory.
+  # config.jar_name = "mywar"
+  # Name of the MANIFEST.MF template for the war file. Defaults to a simple
+  # MANIFEST.MF that contains the version of Warbler used to create the war file.
+  # config.manifest_file = "config/MANIFEST.MF"
+  # When using the 'compiled' feature and specified, only these Ruby
+  # files will be compiled. Default is to compile all \.rb files in
+  # the application.
+  # config.compiled_ruby_files = FileList['app/**/*.rb']
+  # When set to true, Warbler will override the value of ENV['GEM_HOME'] even if it
+  # has already been set. When set to false it will use any existing value of
+  # GEM_HOME if it is set.
+  # config.override_gem_home = true
+  # Allows for specifying custom executables
+  # config.executable = ["rake", "bin/rake"]
+  # Sets default (prefixed) parameters for the executables
+  # config.executable_params = "do:something"
+  # === War files only below here ===
+  # Path to the pre-bundled gem directory inside the war file. Default
+  # is 'WEB-INF/gems'. Specify path if gems are already bundled
+  # before running Warbler. This also sets 'gem.path' inside web.xml.
+  # config.gem_path = "WEB-INF/vendor/bundler_gems"
+  # Files for WEB-INF directory (next to web.xml). This contains
+  # web.xml by default. If there is an .erb-File it will be processed
+  # with webxml-config. You may want to exclude this file via
+  # config.excludes.
+  # config.webinf_files += FileList["jboss-web.xml"]
+  # Files to be included in the root of the webapp.  Note that files in public
+  # will have the leading 'public/' part of the path stripped during staging.
+  # config.public_html = FileList["public/**/*", "doc/**/*"]
+  # Pathmaps for controlling how public HTML files are copied into the .war
+  # config.pathmaps.public_html = ["%{public/,}p"]
+  # Embedded webserver to use with the 'executable' feature. Currently supported
+  # webservers are:
+  # * <tt>winstone</tt> (default) - Winstone 0.9.10 from sourceforge
+  # * <tt>jenkins-ci.winstone</tt> - Improved Winstone from Jenkins CI
+  # * <tt>jetty</tt> - Embedded Jetty from Eclipse
+  # config.webserver = 'jetty'
+  # Value of RAILS_ENV for the webapp -- default as shown below
+  # config.webxml.rails.env = ENV['RAILS_ENV'] || 'production'
+  # Application booter to use, one of :rack, :rails, or :merb (autodetected by default)
+  # config.webxml.booter = :rails
+  # Set JRuby to run in 1.9 mode.
+  # config.webxml.jruby.compat.version = "1.9"
+  # When using the :rack booter, "Rackup" script to use.
+  # - For 'rackup.path', the value points to the location of the rackup
+  # script in the web archive file. You need to make sure this file
+  # gets included in the war, possibly by adding it to config.includes
+  # or config.webinf_files above.
+  # - For 'rackup', the rackup script you provide as an inline string
+  #   is simply embedded in web.xml.
+  # The script is evaluated in a Rack::Builder to load the application.
+  # Examples:
+  # config.webxml.rackup.path = 'WEB-INF/hello.ru'
+  # config.webxml.rackup = %{require './lib/demo'; run Rack::Adapter::Camping.new(Demo)}
+  # config.webxml.rackup = require 'cgi' && CGI::escapeHTML(File.read("config.ru"))
+  # Control the pool of Rails runtimes. Leaving unspecified means
+  # the pool will grow as needed to service requests. It is recommended
+  # that you fix these values when running a production server!
+  # If you're using threadsafe! mode, you probably don't want to set these values,
+  # since 1 runtime(default for threadsafe mode) will be enough.
+  # config.webxml.jruby.min.runtimes = 2
+  # config.webxml.jruby.max.runtimes = 4
+  # JNDI data source name
+  # config.webxml.jndi = 'jdbc/rails'
+end

data/config.ru ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ require 'llt/segmenter/api'
2	+ run Api

data/lib/llt/segmenter/api.rb ADDED Viewed

@@ -0,0 +1,20 @@
+require 'sinatra/base'
+require 'sinatra/respond_with'
+require 'llt/segmenter'
+require 'llt/core/api'
+# Sinatra application exposing the segmenter over HTTP.
+class Api < Sinatra::Base
+  register Sinatra::RespondWith
+  # typecast_params!, extract_text and to_xml are not defined in this file;
+  # presumably they are provided by these helpers - confirm in llt-core.
+  helpers LLT::Core::Api::Helpers
+  # GET /segment: segments the text obtained from the request params and
+  # renders the resulting sentences as XML (the only format registered here).
+  get '/segment' do
+    typecast_params!(params)
+    text = extract_text(params)
+    # the request params double as segmenter options
+    segmenter = LLT::Segmenter.new(params)
+    sentences = segmenter.segment(text)
+    respond_to do |f|
+      f.xml { to_xml(sentences, params) }
+    end
+  end
+end

data/lib/llt/segmenter/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+module LLT
+  class Segmenter
+    VERSION = "0.0.1"
+  end
+end

data/lib/llt/segmenter.rb ADDED Viewed

@@ -0,0 +1,97 @@
+require "llt/constants"
+require "llt/core"
+require "llt/logger"
+require "llt/sentence"
+module LLT
+  class Segmenter
+    include Constants::Abbreviations
+    include Core::Serviceable
+    uses_logger { Logger.new('Segmenter', default: :debug) }
+    # Default values for the options read in #setup.
+    # NOTE(review): presumably parse_option falls back to these when the
+    # caller passes no value - confirm in Core::Serviceable.
+    #
+    #   indexing:         whether sentences receive a running id
+    #   newline_boundary: number of consecutive newlines that count as a
+    #                     sentence boundary
+    def self.default_options
+      {
+        indexing: true,
+        newline_boundary: 2
+      }
+    end
+    # Abbreviations with boundary e.g. \bA
+    #
+    # This doesn't work in jruby (opened an issue at jruby/jruby#1269 ),
+    # so we have to change things as long as this is not fixed.
+    #
+    # (?<=\s|^) can be just \b in MRI 2.0 and upwards
+    AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^)#{abbr}" }.join('|')
+    SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[;\?!:]/
+    DIRECT_SPEECH_DELIMITER = /['"”]/
+    TRAILERS = /\)|<\/.*?>/
+    def segment(string, add_to: nil, **options)
+      setup(options)
+      # dump whitespace at the beginning and end!
+      string.strip!
+      sentences = scan_through_string(StringScanner.new(string))
+      add_to << sentences if add_to.respond_to?(:<<)
+      sentences
+    end
+    private
+    def setup(options)
+      @indexing = parse_option(:indexing, options)
+      @id = 0 if @indexing
+      nl_boundary  = parse_option(:newline_boundary, options)
+      @sentence_closer = Regexp.union(SENTENCE_CLOSER, /\n{#{nl_boundary}}/)
+    end
+    def scan_through_string(scanner, sentences = [])
+      while scanner.rest?
+        sentence = scanner.scan_until(@sentence_closer) ||
+          rescue_no_delimiters(sentences, scanner)
+        sentence << trailing_delimiters(scanner)
+        sentence.strip!
+        unless sentence.empty?
+          curr_id = id
+          @logger.log("Segmented #{curr_id} #{sentence}")
+          sentences << Sentence.new(sentence, curr_id)
+        end
+      end
+      sentences
+    end
+    def id
+      if @indexing
+        @id += 1
+      end
+    end
+    def rescue_no_delimiters(sentences, scanner)
+      if sentences.any?
+        # broken off texts
+        scanner.scan_until(/$/)
+      else
+        # try a simple newline as delimiter, if there was no delimiter
+        scanner.reset
+        @sentence_closer = /\n/
+        if sent = scanner.scan_until(@sentence_closer)
+          sent
+        else
+          # when there is not even a new line, return all input
+          scanner.terminate
+          scanner.string
+        end
+      end
+    end
+    def trailing_delimiters(scanner)
+      trailers = [DIRECT_SPEECH_DELIMITER, TRAILERS]
+      trailers.each_with_object('') do |trailer, str|
+        str << scanner.scan(trailer).to_s # catches nil
+      end
+    end
+  end
+end

data/lib/llt/sentence.rb ADDED Viewed

@@ -0,0 +1,10 @@
+require 'llt/core/containable'
+module LLT
+  # A single sentence, the unit LLT::Segmenter#segment returns.
+  class Sentence
+    include Core::Containable
+    # NOTE(review): presumably the element name used when serializing to
+    # xml - confirm in Core::Containable
+    xml_tag 's'
+    # NOTE(review): presumably exposes the contained elements as #tokens -
+    # confirm in Core::Containable
+    container_alias :tokens
+  end
+end

data/llt-segmenter.gemspec ADDED Viewed

@@ -0,0 +1,29 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'llt/segmenter/version'
+Gem::Specification.new do |spec|
+  spec.name          = "llt-segmenter"
+  spec.version       = LLT::Segmenter::VERSION
+  spec.authors       = ["Gernot Höflechner, Robert Lichstensteiner, Christof Sirk"]
+  spec.email         = ["latin.language.toolkit@gmail.com"]
+  spec.description   = %q{Segments text into sentences}
+  spec.summary       = %q{Segments text into sentences}
+  spec.homepage      = "http://latin-language-toolkit.net"
+  spec.license       = "MIT"
+  spec.files         = `git ls-files`.split($/)
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+  spec.add_development_dependency "bundler", "~> 1.3"
+  spec.add_development_dependency "rake"
+  spec.add_development_dependency "rspec"
+  spec.add_development_dependency "simplecov", "~> 0.7"
+  spec.add_dependency "warbler"
+  #spec.add_dependency "llt-core"
+  #spec.add_dependency "llt-constants"
+  #spec.add_dependency "llt-logger"
+end

data/spec/lib/llt/segmenter/api_spec.rb ADDED Viewed

@@ -0,0 +1,56 @@
+ENV['RACK_ENV'] = 'test'
+require 'spec_helper'
+require 'llt/segmenter/api'
+require 'rack/test'
+def app
+  Api
+end
+describe "segmenter api" do
+  include Rack::Test::Methods
+  describe '/segment' do
+    context "with URI as input" do
+    end
+    let(:text) {{text: "homo mittit. Marcus est."}}
+    context "with text as input" do
+      context "with accept header json" do
+        it "segments the given sentences" do
+          pending
+          get '/segment', text,
+            {"HTTP_ACCEPT" => "application/json"}
+          last_response.should be_ok
+          response = last_response.body
+          parsed_response = JSON.parse(response)
+          parsed_response.should have(2).items
+        end
+      end
+      context "with accept header xml" do
+        it "segments the given sentences" do
+          get '/segment', text,
+            {"HTTP_ACCEPT" => "application/xml"}
+          last_response.should be_ok
+          body = last_response.body
+          body.should =~ /<s n="1">homo mittit\.<\/s>/
+          body.should =~ /<s n="2">Marcus est\.<\/s>/
+        end
+        it "receives params for segmentation and markup" do
+          params = { indexing: false }.merge(text)
+          get '/segment', params,
+            {"HTTP_ACCEPT" => "application/xml"}
+          last_response.should be_ok
+          body = last_response.body
+          body.should =~ /<s>homo mittit\.<\/s>/
+          body.should =~ /<s>Marcus est\.<\/s>/
+        end
+      end
+    end
+  end
+end

data/spec/lib/llt/segmenter_spec.rb ADDED Viewed

@@ -0,0 +1,259 @@
+require 'spec_helper'
+describe LLT::Segmenter do
+  let(:segmenter) { LLT::Segmenter.new }
+  describe "#segment" do
+    it "returns an array of LLT::Sentence elements" do
+      sentences = segmenter.segment("est.")
+      sentences.should have(1).item
+      sentences.first.should be_a LLT::Sentence
+    end
+    it "segments a paragraph of into sentences - easy" do
+      txt = "Cicero est. Caesar est."
+      sentences = segmenter.segment(txt)
+      sentences.should have(2).items
+      sentences[0].to_s.should == "Cicero est."
+      sentences[1].to_s.should == "Caesar est."
+    end
+    it "segments a paragraph of into sentences - complex" do
+      txt = "Cicero est; quis Caesar est? Marcus Antonius!"
+      sentences = segmenter.segment(txt)
+      sentences.should have(3).items
+      sentences[0].to_s.should == "Cicero est;"
+      sentences[1].to_s.should == "quis Caesar est?"
+      sentences[2].to_s.should == "Marcus Antonius!"
+    end
+    it "creates indices by default" do
+      txt = "Cicero est; quis Caesar est? Marcus Antonius!"
+      sentences = segmenter.segment(txt)
+      sentences.map(&:id).should == [1, 2, 3]
+    end
+    it "indices can be turned off" do
+      txt = "Cicero est; quis Caesar est? Marcus Antonius!"
+      sentences = segmenter.segment(txt, indexing: false)
+      sentences.map(&:id).should == [nil, nil, nil]
+    end
+    it "handles abbreviated names" do
+      txt = "C. Caesar est. M. Tullius Cicero est."
+      sentences = segmenter.segment(txt)
+      sentences.should have(2).items
+      sentences[0].to_s.should == "C. Caesar est."
+      sentences[1].to_s.should == "M. Tullius Cicero est."
+    end
+    it "handles abbreviated dates" do
+      txt = "Is dies erat a. d. V Kal. Apr. L. Pisone, A. Gabinio consulibus."
+      sentences = segmenter.segment(txt)
+      sentences.should have(1).item
+    end
+    it "handles more dates" do
+      txt = "Is dies erat a. d. V Ian. Non. Feb. L. App. Pisone ."
+      sentences = segmenter.segment(txt)
+      puts sentences
+      sentences.should have(1).item
+    end
+    it "are only triggered when they have a leading word boundary" do
+      # spec might seem strange, but this didn't work from the start on
+      txt = "erat nauta. est."
+      sentences = segmenter.segment(txt)
+      sentences.should have(2).items
+    end
+    it "handles dates even with numbers that have an abbr dot" do
+      pending('Not solved yet. Think of M.') do
+        txt = "Is dies erat a. d. V. Kal. Apr. L. Pisone, A. Gabinio consulibus."
+        sentences = segmenter.segment(txt)
+        sentences.should have(1).item
+      end
+    end
+    it "splits at :" do
+      txt = 'iubent: fugere manus.'
+      sentences = segmenter.segment(txt)
+      sentences.should have(2).items
+    end
+    it "doesn't create empty sentences" do
+      txt = "text.\n\n\ntext."
+      sentences = segmenter.segment(txt)
+      sentences.should have(2).items
+    end
+    context "with embedded xml" do
+      it "doesn't break up before xml closing tags" do
+        txt = '<grc> text.</grc>'
+        sentences = segmenter.segment(txt)
+        sentences.should have(1).item
+      end
+    end
+    context "newline (\\n) handling" do
+      it "works when in between" do
+        txt = "Filia est.\nFilius est."
+        sentences = segmenter.segment(txt)
+        sentences.should have(2).items
+        sentences[0].to_s.should == "Filia est."
+        sentences[1].to_s.should == "Filius est."
+      end
+      it "works when at the end of a text" do
+        sentences = segmenter.segment("Marcus est.\n")
+        sentences.should have(1).item
+        sentences.first.to_s.should == 'Marcus est.'
+      end
+      it "works with newline and space in between and no new line at the end" do
+        txt = "Fīlius rēgīnae erat.\n Rēgīnam aurō dōnābunt."
+        sentences = segmenter.segment(txt)
+        sentences.should have(2).items
+        sentences[0].to_s.should == "Fīlius rēgīnae erat."
+        sentences[1].to_s.should == "Rēgīnam aurō dōnābunt."
+      end
+      it "works with newline and space in between and new line at the end" do
+        txt = "Fīlius rēgīnae erat nauta.\n Rēgīnam aurō dōnābunt.\n"
+        sentences = segmenter.segment(txt)
+        sentences.should have(2).items
+        sentences[0].to_s.should == "Fīlius rēgīnae erat nauta."
+        sentences[1].to_s.should == "Rēgīnam aurō dōnābunt."
+      end
+      it "treats an empty line as delimiter - might e.g. appear in book titles" do
+        txt = "Marcus est\n\nMarcus est."
+        sentences = segmenter.segment(txt)
+        sentences.should have(2).item
+      end
+      it "number of newlines that count as sentence boundary can be given as option" do
+        txt1 = "Marcus est\n\nMarcus est."
+        txt2 = "Marcus est\n\n\nMarcus est."
+        sentences1 = segmenter.segment(txt1, newline_boundary: 3)
+        sentences2 = segmenter.segment(txt2, newline_boundary: 3)
+        sentences1.should have(1).item
+        sentences2.should have(2).item
+      end
+    end
+    it "handles quantified texts" do
+      txt = "Fēmina puellae pecūniam dabat.\n Fīlia poētae in viīs errābat.\n"
+      sentences = segmenter.segment(txt)
+      sentences.should have(2).item
+    end
+    it "is not disturbed by leading or trailing whitespace" do
+      txt = '   Marcus est. Marcus est.   '
+      sentences = segmenter.segment(txt)
+      sentences.should have(2).item
+    end
+    context "with ellipsis punctuation" do
+      it "handles them at the end of a sentence" do
+        txt = 'Marcus ...'
+        sentences = segmenter.segment(txt)
+        sentences.should have(1).item
+      end
+      it "handles them in the midst of a sentence" do
+        pending 'Tough to do'
+      end
+    end
+    context "direct speech delimiter" do
+      context "with '" do
+        it "handles basic cases when on the outside of the punctuation" do
+          txt = "'Marcus est.'"
+          sentences = segmenter.segment(txt)
+          sentences.should have(1).item
+        end
+        it "handles basic cases when on the inside of the punctuation" do
+          txt = "'Marcus est'?"
+          sentences = segmenter.segment(txt)
+          sentences.should have(1).item
+        end
+      end
+      context 'with "' do
+        it "handles basic cases when on the outside of the punctuation" do
+          txt = '"Marcus est."'
+          sentences = segmenter.segment(txt)
+          sentences.should have(1).item
+        end
+        it "handles basic cases when on the inside of the punctuation" do
+          txt = '"Marcus est"?'
+          sentences = segmenter.segment(txt)
+          sentences.should have(1).item
+        end
+      end
+      context 'with ” (attention: this is NOT the same as "' do
+        it "handles basic cases when on the outside of the punctuation" do
+          txt = '”Marcus est.”'
+          sentences = segmenter.segment(txt)
+          sentences.should have(1).item
+        end
+        it "handles basic cases when on the inside of the punctuation" do
+          txt = '”Marcus est”?'
+          sentences = segmenter.segment(txt)
+          sentences.should have(1).item
+        end
+      end
+    end
+    it "catches trailing parenthesis" do
+      txt = "Marcus est. (Marcus est.) Marcus est."
+      sentences = segmenter.segment(txt)
+      sentences.should have(3).items
+      sentences[0].to_s.should == 'Marcus est.'
+      sentences[1].to_s.should == '(Marcus est.)'
+      sentences[2].to_s.should == 'Marcus est.'
+    end
+    it "handles broken off texts - the rest is an own sentence" do
+      txt = "Marcus est. Marcus est"
+      sentences = segmenter.segment(txt)
+      sentences.should have(2).item
+    end
+    context "with no delimiters present" do
+      it "tries to fallback to single newline boundary" do
+        txt = "Marcus est\nMarcus est"
+        segmenter.segment(txt).should have(2).items
+      end
+      it "returns the whole input as segment when there are no newlines" do
+        txt = "Marcus est"
+        segmenter.segment(txt).should have(1).item
+      end
+    end
+    describe "takes an optional keyword argument add_to" do
+      class ParagraphDummy
+        attr_reader :sentences
+        def initialize; @sentences = []; end
+        def <<(sentences); @sentences += sentences; end
+      end
+      it "adds the result to the given object if #<< is implemented" do
+        paragraph = ParagraphDummy.new
+        s = segmenter.segment("", add_to: paragraph)
+        paragraph.sentences.should == s
+      end
+      it "does nothing to the given object when #<< it does not respond to" do
+        object = double(respond_to?: false)
+        object.should_not receive(:<<)
+        segmenter.segment("", add_to: object)
+      end
+    end
+  end
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,26 @@
+require 'simplecov'
+require 'coveralls'
+Coveralls.wear!
+SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
+  SimpleCov::Formatter::HTMLFormatter,
+  Coveralls::SimpleCov::Formatter
+]
+SimpleCov.start do
+  add_filter '/spec/'
+end
+$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
+require 'llt/segmenter'
+if defined?(LLT::Logger)
+  LLT::Logger.level = nil
+end
+RSpec.configure do |config|
+  config.treat_symbols_as_metadata_keys_with_true_values = true
+  config.run_all_when_everything_filtered = true
+  config.filter_run :focus
+end

metadata ADDED Viewed

@@ -0,0 +1,134 @@
+--- !ruby/object:Gem::Specification
+name: llt-segmenter
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-12-08 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.3'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.3'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: simplecov
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.7'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.7'
+- !ruby/object:Gem::Dependency
+  name: warbler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description: Segments text into sentences
+email:
+- latin.language.toolkit@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- ".rspec"
+- ".travis.yml"
+- Gemfile
+- LICENSE
+- README.md
+- Rakefile
+- config.ru
+- config/warble.rb
+- lib/llt/segmenter.rb
+- lib/llt/segmenter/api.rb
+- lib/llt/segmenter/version.rb
+- lib/llt/sentence.rb
+- llt-segmenter.gemspec
+- spec/lib/llt/segmenter/api_spec.rb
+- spec/lib/llt/segmenter_spec.rb
+- spec/spec_helper.rb
+homepage: http://latin-language-toolkit.net
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.1.5
+signing_key:
+specification_version: 4
+summary: Segments text into sentences
+test_files:
+- spec/lib/llt/segmenter/api_spec.rb
+- spec/lib/llt/segmenter_spec.rb
+- spec/spec_helper.rb