llt-segmenter 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 4759c4666499718fd9364ff782a16f5d7ef818d1
4
+ data.tar.gz: cb27484995b057c79017ac11740dc68284f0085b
5
+ SHA512:
6
+ metadata.gz: abb24d73dc6029e91bb5ba14d81fcd0244ec7cc1619d7848bddff4df785c38dcd1c069877e368930289a248e3e79a905a3d4771e7b7fa6b7087c33427c04d54d
7
+ data.tar.gz: dd8daa449bb083a62ebd6656c6c6221e2af9c36cb0d22fdd95775b8e781f312125857380abc43a854c6e282cda797f77331afe65f0fd5b2f04d6d5e4d417b389
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --tty
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+ before_script:
3
+ - "export JRUBY_OPTS=--2.0"
4
+ rvm:
5
+ - 2.0.0
6
+ - jruby-20mode
data/Gemfile ADDED
@@ -0,0 +1,15 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in llt-segmenter.gemspec
4
+ gemspec
5
+ gem 'pry'
6
+
7
+ gem 'coveralls', require: false
8
+
9
+ gem 'llt-core', git: 'https://github.com/latin-language-toolkit/llt-core.git'
10
+ gem 'llt-constants', git: 'https://github.com/latin-language-toolkit/llt-constants.git'
11
+ gem 'llt-logger', git: 'https://github.com/latin-language-toolkit/llt-logger.git'
12
+
13
+ platform :jruby do
14
+ gem 'jruby-httpclient'
15
+ end
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2013 latin-language-toolkit
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
6
+ this software and associated documentation files (the "Software"), to deal in
7
+ the Software without restriction, including without limitation the rights to
8
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9
+ the Software, and to permit persons to whom the Software is furnished to do so,
10
+ subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,45 @@
1
+ # LLT::Segmenter
2
+
3
+ [![Version](http://allthebadges.io/latin-language-toolkit/llt-segmenter/badge_fury.png)](http://allthebadges.io/latin-language-toolkit/llt-segmenter/badge_fury)
4
+ [![Build Status](https://travis-ci.org/latin-language-toolkit/llt-segmenter.png?branch=master)](https://travis-ci.org/latin-language-toolkit/llt-segmenter)
5
+ [![Dependencies](http://allthebadges.io/latin-language-toolkit/llt-segmenter/gemnasium.png)](http://allthebadges.io/latin-language-toolkit/llt-segmenter/gemnasium)
6
+ [![Coverage](https://coveralls.io/repos/latin-language-toolkit/llt-segmenter/badge.png?branch=master)](https://coveralls.io/r/latin-language-toolkit/llt-segmenter?branch=master)
7
+ [![Code Climate](https://codeclimate.com/github/latin-language-toolkit/llt-segmenter.png)](https://codeclimate.com/github/latin-language-toolkit/llt-segmenter)
8
+
9
+ Segments text into sentences.
10
+
11
+ ## Installation
12
+
13
+ Add this line to your application's Gemfile:
14
+
15
+ gem 'llt-segmenter'
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install llt-segmenter
24
+
25
+ ## Usage
26
+
27
+ TODO: Write usage instructions here
28
+
29
+ ## API
30
+ This is currently a list of requirements; it will evolve into API documentation.
31
+
32
+ Input:
33
+ - Text or (URI)
34
+ - Black-/Whitelist for separators.
35
+
36
+ Output:
37
+ - XML (TEI) or JSON
38
+
39
+ ## Contributing
40
+
41
+ 1. Fork it
42
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
43
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
44
+ 4. Push to the branch (`git push origin my-new-feature`)
45
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
7
+
data/config/warble.rb ADDED
@@ -0,0 +1,162 @@
1
+ # Disable Rake-environment-task framework detection by uncommenting/setting to false
2
+ # Warbler.framework_detection = false
3
+
4
+ # Warbler web application assembly configuration file
5
+ Warbler::Config.new do |config|
6
+ # Features: additional options controlling how the jar is built.
7
+ # Currently the following features are supported:
8
+ # - gemjar: package the gem repository in a jar file in WEB-INF/lib
9
+ # - executable: embed a web server and make the war executable
10
+ # - compiled: compile .rb files to .class files
11
+ # config.features = %w(gemjar)
12
+
13
+ # Application directories to be included in the webapp.
14
+ # config.dirs = %w(app config db lib log script vendor tmp)
15
+
16
+ # Additional files/directories to include, above those in config.dirs
17
+ # config.includes = FileList["db"]
18
+
19
+ # Additional files/directories to exclude
20
+ # config.excludes = FileList["lib/tasks/*"]
21
+
22
+ # Additional Java .jar files to include. Note that if .jar files are placed
23
+ # in lib (and not otherwise excluded) then they need not be mentioned here.
24
+ # JRuby and JRuby-Rack are pre-loaded in this list. Be sure to include your
25
+ # own versions if you directly set the value
26
+ # config.java_libs += FileList["lib/java/*.jar"]
27
+
28
+ # Loose Java classes and miscellaneous files to be included.
29
+ # config.java_classes = FileList["target/classes/**.*"]
30
+
31
+ # One or more pathmaps defining how the java classes should be copied into
32
+ # the archive. The example pathmap below accompanies the java_classes
33
+ # configuration above. See http://rake.rubyforge.org/classes/String.html#M000017
34
+ # for details of how to specify a pathmap.
35
+ # config.pathmaps.java_classes << "%{target/classes/,}p"
36
+
37
+ # Bundler support is built-in. If Warbler finds a Gemfile in the
38
+ # project directory, it will be used to collect the gems to bundle
39
+ # in your application. If you wish to explicitly disable this
40
+ # functionality, uncomment here.
41
+ # config.bundler = false
42
+
43
+ # An array of Bundler groups to avoid including in the war file.
44
+ # Defaults to ["development", "test", "assets"].
45
+ # config.bundle_without = []
46
+
47
+ # Other gems to be included. If you don't use Bundler or a gemspec
48
+ # file, you need to tell Warbler which gems your application needs
49
+ # so that they can be packaged in the archive.
50
+ # For Rails applications, the Rails gems are included by default
51
+ # unless the vendor/rails directory is present.
52
+ # config.gems += ["activerecord-jdbcmysql-adapter", "jruby-openssl"]
53
+ # config.gems << "tzinfo"
54
+
55
+ # Uncomment this if you don't want to package rails gem.
56
+ # config.gems -= ["rails"]
57
+
58
+ # The most recent versions of gems are used.
59
+ # You can specify versions of gems by using a hash assignment:
60
+ # config.gems["rails"] = "2.3.10"
61
+
62
+ # You can also use regexps or Gem::Dependency objects for flexibility or
63
+ # finer-grained control.
64
+ # config.gems << /^merb-/
65
+ # config.gems << Gem::Dependency.new("merb-core", "= 0.9.3")
66
+
67
+ # Include gem dependencies not mentioned specifically. Default is
68
+ # true, uncomment to turn off.
69
+ # config.gem_dependencies = false
70
+
71
+ # Array of regular expressions matching relative paths in gems to be
72
+ # excluded from the war. Defaults to empty, but you can set it like
73
+ # below, which excludes test files.
74
+ # config.gem_excludes = [/^(test|spec)\//]
75
+
76
+ # Pathmaps for controlling how application files are copied into the archive
77
+ # config.pathmaps.application = ["WEB-INF/%p"]
78
+
79
+ # Name of the archive (without the extension). Defaults to the basename
80
+ # of the project directory.
81
+ # config.jar_name = "mywar"
82
+
83
+ # Name of the MANIFEST.MF template for the war file. Defaults to a simple
84
+ # MANIFEST.MF that contains the version of Warbler used to create the war file.
85
+ # config.manifest_file = "config/MANIFEST.MF"
86
+
87
+ # When using the 'compiled' feature and specified, only these Ruby
88
+ # files will be compiled. Default is to compile all \.rb files in
89
+ # the application.
90
+ # config.compiled_ruby_files = FileList['app/**/*.rb']
91
+
92
+ # When set to true, Warbler will override the value of ENV['GEM_HOME'] even if it
93
+ # has already been set. When set to false it will use any existing value of
94
+ # GEM_HOME if it is set.
95
+ # config.override_gem_home = true
96
+
97
+ # Allows for specifying custom executables
98
+ # config.executable = ["rake", "bin/rake"]
99
+
100
+ # Sets default (prefixed) parameters for the executables
101
+ # config.executable_params = "do:something"
102
+
103
+ # === War files only below here ===
104
+
105
+ # Path to the pre-bundled gem directory inside the war file. Default
106
+ # is 'WEB-INF/gems'. Specify path if gems are already bundled
107
+ # before running Warbler. This also sets 'gem.path' inside web.xml.
108
+ # config.gem_path = "WEB-INF/vendor/bundler_gems"
109
+
110
+ # Files for WEB-INF directory (next to web.xml). This contains
111
+ # web.xml by default. If there is an .erb-File it will be processed
112
+ # with webxml-config. You may want to exclude this file via
113
+ # config.excludes.
114
+ # config.webinf_files += FileList["jboss-web.xml"]
115
+
116
+ # Files to be included in the root of the webapp. Note that files in public
117
+ # will have the leading 'public/' part of the path stripped during staging.
118
+ # config.public_html = FileList["public/**/*", "doc/**/*"]
119
+
120
+ # Pathmaps for controlling how public HTML files are copied into the .war
121
+ # config.pathmaps.public_html = ["%{public/,}p"]
122
+
123
+ # Embedded webserver to use with the 'executable' feature. Currently supported
124
+ # webservers are:
125
+ # * <tt>winstone</tt> (default) - Winstone 0.9.10 from sourceforge
126
+ # * <tt>jenkins-ci.winstone</tt> - Improved Winstone from Jenkins CI
127
+ # * <tt>jetty</tt> - Embedded Jetty from Eclipse
128
+ # config.webserver = 'jetty'
129
+
130
+ # Value of RAILS_ENV for the webapp -- default as shown below
131
+ # config.webxml.rails.env = ENV['RAILS_ENV'] || 'production'
132
+
133
+ # Application booter to use, one of :rack, :rails, or :merb (autodetected by default)
134
+ # config.webxml.booter = :rails
135
+
136
+ # Set JRuby to run in 1.9 mode.
137
+ # config.webxml.jruby.compat.version = "1.9"
138
+
139
+ # When using the :rack booter, "Rackup" script to use.
140
+ # - For 'rackup.path', the value points to the location of the rackup
141
+ # script in the web archive file. You need to make sure this file
142
+ # gets included in the war, possibly by adding it to config.includes
143
+ # or config.webinf_files above.
144
+ # - For 'rackup', the rackup script you provide as an inline string
145
+ # is simply embedded in web.xml.
146
+ # The script is evaluated in a Rack::Builder to load the application.
147
+ # Examples:
148
+ # config.webxml.rackup.path = 'WEB-INF/hello.ru'
149
+ # config.webxml.rackup = %{require './lib/demo'; run Rack::Adapter::Camping.new(Demo)}
150
+ # config.webxml.rackup = require 'cgi' && CGI::escapeHTML(File.read("config.ru"))
151
+
152
+ # Control the pool of Rails runtimes. Leaving unspecified means
153
+ # the pool will grow as needed to service requests. It is recommended
154
+ # that you fix these values when running a production server!
155
+ # If you're using threadsafe! mode, you probably don't want to set these values,
156
+ # since 1 runtime(default for threadsafe mode) will be enough.
157
+ # config.webxml.jruby.min.runtimes = 2
158
+ # config.webxml.jruby.max.runtimes = 4
159
+
160
+ # JNDI data source name
161
+ # config.webxml.jndi = 'jdbc/rails'
162
+ end
data/config.ru ADDED
@@ -0,0 +1,2 @@
1
+ require 'llt/segmenter/api'
2
+ run Api
@@ -0,0 +1,20 @@
1
+ require 'sinatra/base'
2
+ require 'sinatra/respond_with'
3
+ require 'llt/segmenter'
4
+ require 'llt/core/api'
5
+
6
+ class Api < Sinatra::Base
7
+ register Sinatra::RespondWith
8
+ helpers LLT::Core::Api::Helpers
9
+
10
+ get '/segment' do
11
+ typecast_params!(params)
12
+ text = extract_text(params)
13
+ segmenter = LLT::Segmenter.new(params)
14
+ sentences = segmenter.segment(text)
15
+
16
+ respond_to do |f|
17
+ f.xml { to_xml(sentences, params) }
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,5 @@
1
+ module LLT
2
+ class Segmenter
3
+ VERSION = "0.0.1"
4
+ end
5
+ end
@@ -0,0 +1,97 @@
1
+ require "llt/constants"
2
+ require "llt/core"
3
+ require "llt/logger"
4
+ require "llt/sentence"
5
+
6
+ module LLT
7
+ class Segmenter
8
+ include Constants::Abbreviations
9
+ include Core::Serviceable
10
+
11
+ uses_logger { Logger.new('Segmenter', default: :debug) }
12
+
13
+ def self.default_options
14
+ {
15
+ indexing: true,
16
+ newline_boundary: 2
17
+ }
18
+ end
19
+
20
+ # Abbreviations with boundary e.g. \bA
21
+ #
22
+ # This doesn't work in jruby (opened an issue at jruby/jruby#1269 ),
23
+ # so we have to change things as long as this is not fixed.
24
+ #
25
+ # (?<=\s|^) can be just \b in MRI 2.0 and upwards
26
+ AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^)#{abbr}" }.join('|')
27
+ SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[;\?!:]/
28
+ DIRECT_SPEECH_DELIMITER = /['"”]/
29
+ TRAILERS = /\)|<\/.*?>/
30
+
31
# Splits +string+ into sentences.
#
# string  - the text to segment. It is not modified: surrounding
#           whitespace is removed on a private copy, so frozen strings
#           and strings the caller still needs are safe to pass in.
# add_to  - optional object; when it responds to #<<, the complete
#           result collection is pushed onto it in a single call.
# options - remaining keyword options, handed to #setup.
#
# Returns whatever #scan_through_string returns for the stripped text.
def segment(string, add_to: nil, **options)
  setup(options)
  # String#strip (not strip!) returns a new string, which keeps the
  # caller's object untouched and gives the scanner a mutable buffer.
  text = string.strip
  sentences = scan_through_string(StringScanner.new(text))
  add_to << sentences if add_to.respond_to?(:<<)
  sentences
end
39
+
40
+ private
41
+
42
# Prepares per-call state from the given options: the indexing flag, the
# sentence counter (reset to 0 only when indexing is on) and the regexp
# that closes a sentence (SENTENCE_CLOSER or a run of newlines whose
# length comes from the :newline_boundary option).
def setup(options)
  @indexing = parse_option(:indexing, options)
  @id = 0 if @indexing

  newline_run = /\n{#{parse_option(:newline_boundary, options)}}/
  @sentence_closer = Regexp.union(SENTENCE_CLOSER, newline_run)
end
49
+
50
# Walks the scanner to its end and collects one Sentence per non-empty
# chunk. A chunk is the text up to the next sentence closer (or the
# fallback from #rescue_no_delimiters when no closer is found) plus any
# trailing delimiters that directly follow it. Chunks that are empty after
# stripping are dropped and do not consume an id.
def scan_through_string(scanner, sentences = [])
  until scanner.eos?
    chunk = scanner.scan_until(@sentence_closer)
    chunk ||= rescue_no_delimiters(sentences, scanner)
    chunk << trailing_delimiters(scanner)
    chunk.strip!
    next if chunk.empty?

    number = id
    @logger.log("Segmented #{number} #{chunk}")
    sentences << Sentence.new(chunk, number)
  end
  sentences
end
65
+
66
# Next sentence number: increments and returns the counter while indexing
# is enabled, otherwise returns nil.
def id
  return unless @indexing

  @id += 1
end
71
+
72
# Fallback used when no sentence closer can be found in the remaining text.
#
# sentences - the sentences collected so far (only checked for presence).
# scanner   - the StringScanner positioned at the unsegmented rest.
#
# Returns the next chunk of text as a String.
def rescue_no_delimiters(sentences, scanner)
  if sentences.any?
    # Broken-off text: take the rest of the current line.
    #
    # `$` matches *before* a newline without consuming it, so a previous
    # call may have left the scanner sitting on that newline. Scanning for
    # `$` again from there would return "" without advancing and the
    # caller's loop would never terminate. Stepping over pending line
    # breaks first guarantees progress.
    scanner.skip(/\n+/)
    scanner.scan_until(/$/)
  else
    # Nothing was segmented yet: start over and try a single newline as the
    # delimiter. The closer stays switched for the rest of this run.
    scanner.reset
    @sentence_closer = /\n/
    scanner.scan_until(@sentence_closer) || begin
      # Not even a newline: the whole input is one segment.
      scanner.terminate
      scanner.string
    end
  end
end
89
+
90
# Consumes delimiters that directly follow the scanner position and returns
# them as one string: first at most one direct-speech delimiter, then at
# most one trailer (as defined by TRAILERS). Returns "" when neither is
# present.
def trailing_delimiters(scanner)
  quote   = scanner.scan(DIRECT_SPEECH_DELIMITER)
  closing = scanner.scan(TRAILERS)
  # nil interpolates as an empty string
  "#{quote}#{closing}"
end
96
+ end
97
+ end
@@ -0,0 +1,10 @@
1
+ require 'llt/core/containable'
2
+
3
+ module LLT
4
+ class Sentence
5
+ include Core::Containable
6
+
7
+ xml_tag 's'
8
+ container_alias :tokens
9
+ end
10
+ end
@@ -0,0 +1,29 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'llt/segmenter/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "llt-segmenter"
8
+ spec.version = LLT::Segmenter::VERSION
9
+ spec.authors = ["Gernot Höflechner, Robert Lichstensteiner, Christof Sirk"]
10
+ spec.email = ["latin.language.toolkit@gmail.com"]
11
+ spec.description = %q{Segments text into sentences}
12
+ spec.summary = %q{Segments text into sentences}
13
+ spec.homepage = "http://latin-language-toolkit.net"
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files`.split($/)
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_development_dependency "bundler", "~> 1.3"
22
+ spec.add_development_dependency "rake"
23
+ spec.add_development_dependency "rspec"
24
+ spec.add_development_dependency "simplecov", "~> 0.7"
25
+ spec.add_dependency "warbler"
26
+ #spec.add_dependency "llt-core"
27
+ #spec.add_dependency "llt-constants"
28
+ #spec.add_dependency "llt-logger"
29
+ end
@@ -0,0 +1,56 @@
1
+ ENV['RACK_ENV'] = 'test'
2
+
3
+ require 'spec_helper'
4
+ require 'llt/segmenter/api'
5
+ require 'rack/test'
6
+
7
+ def app
8
+ Api
9
+ end
10
+
11
+ describe "segmenter api" do
12
+ include Rack::Test::Methods
13
+
14
+ describe '/segment' do
15
+ context "with URI as input" do
16
+ end
17
+
18
+ let(:text) {{text: "homo mittit. Marcus est."}}
19
+
20
+ context "with text as input" do
21
+ context "with accept header json" do
22
+ it "segments the given sentences" do
23
+ pending
24
+ get '/segment', text,
25
+ {"HTTP_ACCEPT" => "application/json"}
26
+ last_response.should be_ok
27
+ response = last_response.body
28
+ parsed_response = JSON.parse(response)
29
+ parsed_response.should have(2).items
30
+ end
31
+ end
32
+
33
+ context "with accept header xml" do
34
+ it "segments the given sentences" do
35
+ get '/segment', text,
36
+ {"HTTP_ACCEPT" => "application/xml"}
37
+ last_response.should be_ok
38
+ body = last_response.body
39
+ body.should =~ /<s n="1">homo mittit\.<\/s>/
40
+ body.should =~ /<s n="2">Marcus est\.<\/s>/
41
+ end
42
+
43
+ it "receives params for segmentation and markup" do
44
+ params = { indexing: false }.merge(text)
45
+
46
+ get '/segment', params,
47
+ {"HTTP_ACCEPT" => "application/xml"}
48
+ last_response.should be_ok
49
+ body = last_response.body
50
+ body.should =~ /<s>homo mittit\.<\/s>/
51
+ body.should =~ /<s>Marcus est\.<\/s>/
52
+ end
53
+ end
54
+ end
55
+ end
56
+ end
@@ -0,0 +1,259 @@
1
+ require 'spec_helper'
2
+
3
+ describe LLT::Segmenter do
4
+ let(:segmenter) { LLT::Segmenter.new }
5
+ describe "#segment" do
6
+ it "returns an array of LLT::Sentence elements" do
7
+ sentences = segmenter.segment("est.")
8
+ sentences.should have(1).item
9
+ sentences.first.should be_a LLT::Sentence
10
+ end
11
+
12
+ it "segments a paragraph of into sentences - easy" do
13
+ txt = "Cicero est. Caesar est."
14
+ sentences = segmenter.segment(txt)
15
+ sentences.should have(2).items
16
+ sentences[0].to_s.should == "Cicero est."
17
+ sentences[1].to_s.should == "Caesar est."
18
+ end
19
+
20
+ it "segments a paragraph of into sentences - complex" do
21
+ txt = "Cicero est; quis Caesar est? Marcus Antonius!"
22
+ sentences = segmenter.segment(txt)
23
+ sentences.should have(3).items
24
+ sentences[0].to_s.should == "Cicero est;"
25
+ sentences[1].to_s.should == "quis Caesar est?"
26
+ sentences[2].to_s.should == "Marcus Antonius!"
27
+ end
28
+
29
+ it "creates indices by default" do
30
+ txt = "Cicero est; quis Caesar est? Marcus Antonius!"
31
+ sentences = segmenter.segment(txt)
32
+ sentences.map(&:id).should == [1, 2, 3]
33
+ end
34
+
35
+ it "indices can be turned off" do
36
+ txt = "Cicero est; quis Caesar est? Marcus Antonius!"
37
+ sentences = segmenter.segment(txt, indexing: false)
38
+ sentences.map(&:id).should == [nil, nil, nil]
39
+ end
40
+
41
+ it "handles abbreviated names" do
42
+ txt = "C. Caesar est. M. Tullius Cicero est."
43
+ sentences = segmenter.segment(txt)
44
+ sentences.should have(2).items
45
+ sentences[0].to_s.should == "C. Caesar est."
46
+ sentences[1].to_s.should == "M. Tullius Cicero est."
47
+ end
48
+
49
+ it "handles abbreviated dates" do
50
+ txt = "Is dies erat a. d. V Kal. Apr. L. Pisone, A. Gabinio consulibus."
51
+ sentences = segmenter.segment(txt)
52
+ sentences.should have(1).item
53
+ end
54
+
55
+ it "handles more dates" do
56
+ txt = "Is dies erat a. d. V Ian. Non. Feb. L. App. Pisone ."
57
+ sentences = segmenter.segment(txt)
58
+ puts sentences
59
+ sentences.should have(1).item
60
+ end
61
+
62
+ it "are only triggered when they have a leading word boundary" do
63
+ # spec might seem strange, but this didn't work from the start on
64
+ txt = "erat nauta. est."
65
+ sentences = segmenter.segment(txt)
66
+ sentences.should have(2).items
67
+ end
68
+
69
+ it "handles dates even with numbers that have an abbr dot" do
70
+ pending('Not solved yet. Think of M.') do
71
+ txt = "Is dies erat a. d. V. Kal. Apr. L. Pisone, A. Gabinio consulibus."
72
+ sentences = segmenter.segment(txt)
73
+ sentences.should have(1).item
74
+ end
75
+ end
76
+
77
+ it "splits at :" do
78
+ txt = 'iubent: fugere manus.'
79
+ sentences = segmenter.segment(txt)
80
+ sentences.should have(2).items
81
+ end
82
+
83
+ it "doesn't create empty sentences" do
84
+ txt = "text.\n\n\ntext."
85
+ sentences = segmenter.segment(txt)
86
+ sentences.should have(2).items
87
+ end
88
+
89
+ context "with embedded xml" do
90
+ it "doesn't break up before xml closing tags" do
91
+ txt = '<grc> text.</grc>'
92
+ sentences = segmenter.segment(txt)
93
+ sentences.should have(1).item
94
+ end
95
+ end
96
+
97
+ context "newline (\\n) handling" do
98
+ it "works when in between" do
99
+ txt = "Filia est.\nFilius est."
100
+ sentences = segmenter.segment(txt)
101
+ sentences.should have(2).items
102
+ sentences[0].to_s.should == "Filia est."
103
+ sentences[1].to_s.should == "Filius est."
104
+ end
105
+
106
+ it "works when at the end of a text" do
107
+ sentences = segmenter.segment("Marcus est.\n")
108
+ sentences.should have(1).item
109
+ sentences.first.to_s.should == 'Marcus est.'
110
+ end
111
+
112
+ it "works with newline and space in between and no new line at the end" do
113
+ txt = "Fīlius rēgīnae erat.\n Rēgīnam aurō dōnābunt."
114
+ sentences = segmenter.segment(txt)
115
+ sentences.should have(2).items
116
+ sentences[0].to_s.should == "Fīlius rēgīnae erat."
117
+ sentences[1].to_s.should == "Rēgīnam aurō dōnābunt."
118
+ end
119
+
120
+ it "works with newline and space in between and new line at the end" do
121
+ txt = "Fīlius rēgīnae erat nauta.\n Rēgīnam aurō dōnābunt.\n"
122
+ sentences = segmenter.segment(txt)
123
+ sentences.should have(2).items
124
+ sentences[0].to_s.should == "Fīlius rēgīnae erat nauta."
125
+ sentences[1].to_s.should == "Rēgīnam aurō dōnābunt."
126
+ end
127
+
128
+ it "treats an empty line as delimiter - might e.g. appear in book titles" do
129
+ txt = "Marcus est\n\nMarcus est."
130
+ sentences = segmenter.segment(txt)
131
+ sentences.should have(2).item
132
+ end
133
+
134
+ it "number of newlines that count as sentence boundary can be given as option" do
135
+ txt1 = "Marcus est\n\nMarcus est."
136
+ txt2 = "Marcus est\n\n\nMarcus est."
137
+ sentences1 = segmenter.segment(txt1, newline_boundary: 3)
138
+ sentences2 = segmenter.segment(txt2, newline_boundary: 3)
139
+ sentences1.should have(1).item
140
+ sentences2.should have(2).item
141
+ end
142
+ end
143
+
144
+ it "handles quantified texts" do
145
+ txt = "Fēmina puellae pecūniam dabat.\n Fīlia poētae in viīs errābat.\n"
146
+ sentences = segmenter.segment(txt)
147
+ sentences.should have(2).item
148
+ end
149
+
150
+ it "is not disturbed by leading or trailing whitespace" do
151
+ txt = ' Marcus est. Marcus est. '
152
+ sentences = segmenter.segment(txt)
153
+ sentences.should have(2).item
154
+ end
155
+
156
+ context "with ellipsis punctuation" do
157
+ it "handles them at the end of a sentence" do
158
+ txt = 'Marcus ...'
159
+ sentences = segmenter.segment(txt)
160
+ sentences.should have(1).item
161
+ end
162
+
163
+ it "handles them in the midst of a sentence" do
164
+ pending 'Tough to do'
165
+ end
166
+ end
167
+
168
+ context "direct speech delimiter" do
169
+ context "with '" do
170
+ it "handles basic cases when on the outside of the punctuation" do
171
+ txt = "'Marcus est.'"
172
+ sentences = segmenter.segment(txt)
173
+ sentences.should have(1).item
174
+ end
175
+
176
+ it "handles basic cases when on the inside of the punctuation" do
177
+ txt = "'Marcus est'?"
178
+ sentences = segmenter.segment(txt)
179
+ sentences.should have(1).item
180
+ end
181
+ end
182
+
183
+ context 'with "' do
184
+ it "handles basic cases when on the outside of the punctuation" do
185
+ txt = '"Marcus est."'
186
+ sentences = segmenter.segment(txt)
187
+ sentences.should have(1).item
188
+ end
189
+
190
+ it "handles basic cases when on the inside of the punctuation" do
191
+ txt = '"Marcus est"?'
192
+ sentences = segmenter.segment(txt)
193
+ sentences.should have(1).item
194
+ end
195
+ end
196
+
197
+ context 'with ” (attention: this is NOT the same as "' do
198
+ it "handles basic cases when on the outside of the punctuation" do
199
+ txt = '”Marcus est.”'
200
+ sentences = segmenter.segment(txt)
201
+ sentences.should have(1).item
202
+ end
203
+
204
+ it "handles basic cases when on the inside of the punctuation" do
205
+ txt = '”Marcus est”?'
206
+ sentences = segmenter.segment(txt)
207
+ sentences.should have(1).item
208
+ end
209
+ end
210
+ end
211
+
212
+ it "catches trailing parenthesis" do
213
+ txt = "Marcus est. (Marcus est.) Marcus est."
214
+ sentences = segmenter.segment(txt)
215
+ sentences.should have(3).items
216
+ sentences[0].to_s.should == 'Marcus est.'
217
+ sentences[1].to_s.should == '(Marcus est.)'
218
+ sentences[2].to_s.should == 'Marcus est.'
219
+ end
220
+
221
+ it "handles broken off texts - the rest is an own sentence" do
222
+ txt = "Marcus est. Marcus est"
223
+ sentences = segmenter.segment(txt)
224
+ sentences.should have(2).item
225
+ end
226
+
227
+ context "with no delimiters present" do
228
+ it "tries to fallback to single newline boundary" do
229
+ txt = "Marcus est\nMarcus est"
230
+ segmenter.segment(txt).should have(2).items
231
+ end
232
+
233
+ it "returns the whole input as segment when there are no newlines" do
234
+ txt = "Marcus est"
235
+ segmenter.segment(txt).should have(1).item
236
+ end
237
+ end
238
+
239
+ describe "takes an optional keyword argument add_to" do
240
+ class ParagraphDummy
241
+ attr_reader :sentences
242
+ def initialize; @sentences = []; end
243
+ def <<(sentences); @sentences += sentences; end
244
+ end
245
+
246
+ it "adds the result to the given object if #<< is implemented" do
247
+ paragraph = ParagraphDummy.new
248
+ s = segmenter.segment("", add_to: paragraph)
249
+ paragraph.sentences.should == s
250
+ end
251
+
252
+ it "does nothing to the given object when #<< it does not respond to" do
253
+ object = double(respond_to?: false)
254
+ object.should_not receive(:<<)
255
+ segmenter.segment("", add_to: object)
256
+ end
257
+ end
258
+ end
259
+ end
@@ -0,0 +1,26 @@
1
+ require 'simplecov'
2
+ require 'coveralls'
3
+
4
+ Coveralls.wear!
5
+
6
+ SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
7
+ SimpleCov::Formatter::HTMLFormatter,
8
+ Coveralls::SimpleCov::Formatter
9
+ ]
10
+
11
+ SimpleCov.start do
12
+ add_filter '/spec/'
13
+ end
14
+
15
+ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
16
+ require 'llt/segmenter'
17
+
18
+ if defined?(LLT::Logger)
19
+ LLT::Logger.level = nil
20
+ end
21
+
22
+ RSpec.configure do |config|
23
+ config.treat_symbols_as_metadata_keys_with_true_values = true
24
+ config.run_all_when_everything_filtered = true
25
+ config.filter_run :focus
26
+ end
metadata ADDED
@@ -0,0 +1,134 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: llt-segmenter
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-12-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: simplecov
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '0.7'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '0.7'
69
+ - !ruby/object:Gem::Dependency
70
+ name: warbler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: Segments text into sentences
84
+ email:
85
+ - latin.language.toolkit@gmail.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - ".gitignore"
91
+ - ".rspec"
92
+ - ".travis.yml"
93
+ - Gemfile
94
+ - LICENSE
95
+ - README.md
96
+ - Rakefile
97
+ - config.ru
98
+ - config/warble.rb
99
+ - lib/llt/segmenter.rb
100
+ - lib/llt/segmenter/api.rb
101
+ - lib/llt/segmenter/version.rb
102
+ - lib/llt/sentence.rb
103
+ - llt-segmenter.gemspec
104
+ - spec/lib/llt/segmenter/api_spec.rb
105
+ - spec/lib/llt/segmenter_spec.rb
106
+ - spec/spec_helper.rb
107
+ homepage: http://latin-language-toolkit.net
108
+ licenses:
109
+ - MIT
110
+ metadata: {}
111
+ post_install_message:
112
+ rdoc_options: []
113
+ require_paths:
114
+ - lib
115
+ required_ruby_version: !ruby/object:Gem::Requirement
116
+ requirements:
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ version: '0'
120
+ required_rubygems_version: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ requirements: []
126
+ rubyforge_project:
127
+ rubygems_version: 2.1.5
128
+ signing_key:
129
+ specification_version: 4
130
+ summary: Segments text into sentences
131
+ test_files:
132
+ - spec/lib/llt/segmenter/api_spec.rb
133
+ - spec/lib/llt/segmenter_spec.rb
134
+ - spec/spec_helper.rb