llt-segmenter 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 4759c4666499718fd9364ff782a16f5d7ef818d1
4
+ data.tar.gz: cb27484995b057c79017ac11740dc68284f0085b
5
+ SHA512:
6
+ metadata.gz: abb24d73dc6029e91bb5ba14d81fcd0244ec7cc1619d7848bddff4df785c38dcd1c069877e368930289a248e3e79a905a3d4771e7b7fa6b7087c33427c04d54d
7
+ data.tar.gz: dd8daa449bb083a62ebd6656c6c6221e2af9c36cb0d22fdd95775b8e781f312125857380abc43a854c6e282cda797f77331afe65f0fd5b2f04d6d5e4d417b389
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --tty
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+ before_script:
3
+ - "export JRUBY_OPTS=--2.0"
4
+ rvm:
5
+ - 2.0.0
6
+ - jruby-20mode
data/Gemfile ADDED
@@ -0,0 +1,15 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in llt-segmenter.gemspec
4
+ gemspec
5
+ gem 'pry'
6
+
7
+ gem 'coveralls', require: false
8
+
9
+ gem 'llt-core', git: 'https://github.com/latin-language-toolkit/llt-core.git'
10
+ gem 'llt-constants', git: 'https://github.com/latin-language-toolkit/llt-constants.git'
11
+ gem 'llt-logger', git: 'https://github.com/latin-language-toolkit/llt-logger.git'
12
+
13
+ platform :jruby do
14
+ gem 'jruby-httpclient'
15
+ end
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2013 latin-language-toolkit
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
6
+ this software and associated documentation files (the "Software"), to deal in
7
+ the Software without restriction, including without limitation the rights to
8
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9
+ the Software, and to permit persons to whom the Software is furnished to do so,
10
+ subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,45 @@
1
+ # LLT::Segmenter
2
+
3
+ [![Version](http://allthebadges.io/latin-language-toolkit/llt-segmenter/badge_fury.png)](http://allthebadges.io/latin-language-toolkit/llt-segmenter/badge_fury)
4
+ [![Build Status](https://travis-ci.org/latin-language-toolkit/llt-segmenter.png?branch=master)](https://travis-ci.org/latin-language-toolkit/llt-segmenter)
5
+ [![Dependencies](http://allthebadges.io/latin-language-toolkit/llt-segmenter/gemnasium.png)](http://allthebadges.io/latin-language-toolkit/llt-segmenter/gemnasium)
6
+ [![Coverage](https://coveralls.io/repos/latin-language-toolkit/llt-segmenter/badge.png?branch=master)](https://coveralls.io/r/latin-language-toolkit/llt-segmenter?branch=master)
7
+ [![Code Climate](https://codeclimate.com/github/latin-language-toolkit/llt-segmenter.png)](https://codeclimate.com/github/latin-language-toolkit/llt-segmenter)
8
+
9
+ Segments text into sentences.
10
+
11
+ ## Installation
12
+
13
+ Add this line to your application's Gemfile:
14
+
15
+ gem 'llt-segmenter'
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install llt-segmenter
24
+
25
+ ## Usage
26
+
27
+ TODO: Write usage instructions here
28
+
29
+ ## API
30
+ This is currently a list of requirements; it will evolve into the API documentation.
31
+
32
+ Input:
33
+ - Text (or a URI)
34
+ - Black-/Whitelist for separators.
35
+
36
+ Output:
37
+ - XML (TEI) or JSON
38
+
39
+ ## Contributing
40
+
41
+ 1. Fork it
42
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
43
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
44
+ 4. Push to the branch (`git push origin my-new-feature`)
45
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
7
+
data/config/warble.rb ADDED
@@ -0,0 +1,162 @@
1
+ # Disable Rake-environment-task framework detection by uncommenting/setting to false
2
+ # Warbler.framework_detection = false
3
+
4
+ # Warbler web application assembly configuration file
5
+ Warbler::Config.new do |config|
6
+ # Features: additional options controlling how the jar is built.
7
+ # Currently the following features are supported:
8
+ # - gemjar: package the gem repository in a jar file in WEB-INF/lib
9
+ # - executable: embed a web server and make the war executable
10
+ # - compiled: compile .rb files to .class files
11
+ # config.features = %w(gemjar)
12
+
13
+ # Application directories to be included in the webapp.
14
+ # config.dirs = %w(app config db lib log script vendor tmp)
15
+
16
+ # Additional files/directories to include, above those in config.dirs
17
+ # config.includes = FileList["db"]
18
+
19
+ # Additional files/directories to exclude
20
+ # config.excludes = FileList["lib/tasks/*"]
21
+
22
+ # Additional Java .jar files to include. Note that if .jar files are placed
23
+ # in lib (and not otherwise excluded) then they need not be mentioned here.
24
+ # JRuby and JRuby-Rack are pre-loaded in this list. Be sure to include your
25
+ # own versions if you directly set the value
26
+ # config.java_libs += FileList["lib/java/*.jar"]
27
+
28
+ # Loose Java classes and miscellaneous files to be included.
29
+ # config.java_classes = FileList["target/classes/**.*"]
30
+
31
+ # One or more pathmaps defining how the java classes should be copied into
32
+ # the archive. The example pathmap below accompanies the java_classes
33
+ # configuration above. See http://rake.rubyforge.org/classes/String.html#M000017
34
+ # for details of how to specify a pathmap.
35
+ # config.pathmaps.java_classes << "%{target/classes/,}p"
36
+
37
+ # Bundler support is built-in. If Warbler finds a Gemfile in the
38
+ # project directory, it will be used to collect the gems to bundle
39
+ # in your application. If you wish to explicitly disable this
40
+ # functionality, uncomment here.
41
+ # config.bundler = false
42
+
43
+ # An array of Bundler groups to avoid including in the war file.
44
+ # Defaults to ["development", "test", "assets"].
45
+ # config.bundle_without = []
46
+
47
+ # Other gems to be included. If you don't use Bundler or a gemspec
48
+ # file, you need to tell Warbler which gems your application needs
49
+ # so that they can be packaged in the archive.
50
+ # For Rails applications, the Rails gems are included by default
51
+ # unless the vendor/rails directory is present.
52
+ # config.gems += ["activerecord-jdbcmysql-adapter", "jruby-openssl"]
53
+ # config.gems << "tzinfo"
54
+
55
+ # Uncomment this if you don't want to package rails gem.
56
+ # config.gems -= ["rails"]
57
+
58
+ # The most recent versions of gems are used.
59
+ # You can specify versions of gems by using a hash assignment:
60
+ # config.gems["rails"] = "2.3.10"
61
+
62
+ # You can also use regexps or Gem::Dependency objects for flexibility or
63
+ # finer-grained control.
64
+ # config.gems << /^merb-/
65
+ # config.gems << Gem::Dependency.new("merb-core", "= 0.9.3")
66
+
67
+ # Include gem dependencies not mentioned specifically. Default is
68
+ # true, uncomment to turn off.
69
+ # config.gem_dependencies = false
70
+
71
+ # Array of regular expressions matching relative paths in gems to be
72
+ # excluded from the war. Defaults to empty, but you can set it like
73
+ # below, which excludes test files.
74
+ # config.gem_excludes = [/^(test|spec)\//]
75
+
76
+ # Pathmaps for controlling how application files are copied into the archive
77
+ # config.pathmaps.application = ["WEB-INF/%p"]
78
+
79
+ # Name of the archive (without the extension). Defaults to the basename
80
+ # of the project directory.
81
+ # config.jar_name = "mywar"
82
+
83
+ # Name of the MANIFEST.MF template for the war file. Defaults to a simple
84
+ # MANIFEST.MF that contains the version of Warbler used to create the war file.
85
+ # config.manifest_file = "config/MANIFEST.MF"
86
+
87
+ # When using the 'compiled' feature and specified, only these Ruby
88
+ # files will be compiled. Default is to compile all \.rb files in
89
+ # the application.
90
+ # config.compiled_ruby_files = FileList['app/**/*.rb']
91
+
92
+   # When set to true, Warbler will override the value of ENV['GEM_HOME'] even if it
93
+ # has already been set. When set to false it will use any existing value of
94
+ # GEM_HOME if it is set.
95
+ # config.override_gem_home = true
96
+
97
+   # Allows for specifying custom executables
98
+ # config.executable = ["rake", "bin/rake"]
99
+
100
+ # Sets default (prefixed) parameters for the executables
101
+ # config.executable_params = "do:something"
102
+
103
+ # === War files only below here ===
104
+
105
+ # Path to the pre-bundled gem directory inside the war file. Default
106
+ # is 'WEB-INF/gems'. Specify path if gems are already bundled
107
+ # before running Warbler. This also sets 'gem.path' inside web.xml.
108
+ # config.gem_path = "WEB-INF/vendor/bundler_gems"
109
+
110
+ # Files for WEB-INF directory (next to web.xml). This contains
111
+ # web.xml by default. If there is an .erb-File it will be processed
112
+ # with webxml-config. You may want to exclude this file via
113
+ # config.excludes.
114
+ # config.webinf_files += FileList["jboss-web.xml"]
115
+
116
+ # Files to be included in the root of the webapp. Note that files in public
117
+ # will have the leading 'public/' part of the path stripped during staging.
118
+ # config.public_html = FileList["public/**/*", "doc/**/*"]
119
+
120
+ # Pathmaps for controlling how public HTML files are copied into the .war
121
+ # config.pathmaps.public_html = ["%{public/,}p"]
122
+
123
+ # Embedded webserver to use with the 'executable' feature. Currently supported
124
+ # webservers are:
125
+ # * <tt>winstone</tt> (default) - Winstone 0.9.10 from sourceforge
126
+ # * <tt>jenkins-ci.winstone</tt> - Improved Winstone from Jenkins CI
127
+ # * <tt>jetty</tt> - Embedded Jetty from Eclipse
128
+ # config.webserver = 'jetty'
129
+
130
+ # Value of RAILS_ENV for the webapp -- default as shown below
131
+ # config.webxml.rails.env = ENV['RAILS_ENV'] || 'production'
132
+
133
+ # Application booter to use, one of :rack, :rails, or :merb (autodetected by default)
134
+ # config.webxml.booter = :rails
135
+
136
+ # Set JRuby to run in 1.9 mode.
137
+ # config.webxml.jruby.compat.version = "1.9"
138
+
139
+ # When using the :rack booter, "Rackup" script to use.
140
+ # - For 'rackup.path', the value points to the location of the rackup
141
+ # script in the web archive file. You need to make sure this file
142
+ # gets included in the war, possibly by adding it to config.includes
143
+ # or config.webinf_files above.
144
+ # - For 'rackup', the rackup script you provide as an inline string
145
+ # is simply embedded in web.xml.
146
+ # The script is evaluated in a Rack::Builder to load the application.
147
+ # Examples:
148
+ # config.webxml.rackup.path = 'WEB-INF/hello.ru'
149
+ # config.webxml.rackup = %{require './lib/demo'; run Rack::Adapter::Camping.new(Demo)}
150
+ # config.webxml.rackup = require 'cgi' && CGI::escapeHTML(File.read("config.ru"))
151
+
152
+ # Control the pool of Rails runtimes. Leaving unspecified means
153
+ # the pool will grow as needed to service requests. It is recommended
154
+ # that you fix these values when running a production server!
155
+ # If you're using threadsafe! mode, you probably don't want to set these values,
156
+ # since 1 runtime(default for threadsafe mode) will be enough.
157
+ # config.webxml.jruby.min.runtimes = 2
158
+ # config.webxml.jruby.max.runtimes = 4
159
+
160
+ # JNDI data source name
161
+ # config.webxml.jndi = 'jdbc/rails'
162
+ end
data/config.ru ADDED
@@ -0,0 +1,2 @@
1
+ require 'llt/segmenter/api'
2
+ run Api
@@ -0,0 +1,20 @@
1
+ require 'sinatra/base'
2
+ require 'sinatra/respond_with'
3
+ require 'llt/segmenter'
4
+ require 'llt/core/api'
5
+
6
+ class Api < Sinatra::Base
7
+ register Sinatra::RespondWith
8
+ helpers LLT::Core::Api::Helpers
9
+
10
+ get '/segment' do
11
+ typecast_params!(params)
12
+ text = extract_text(params)
13
+ segmenter = LLT::Segmenter.new(params)
14
+ sentences = segmenter.segment(text)
15
+
16
+ respond_to do |f|
17
+ f.xml { to_xml(sentences, params) }
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,5 @@
1
+ module LLT
2
+ class Segmenter
3
+ VERSION = "0.0.1"
4
+ end
5
+ end
@@ -0,0 +1,97 @@
1
+ require "llt/constants"
2
+ require "llt/core"
3
+ require "llt/logger"
4
+ require "llt/sentence"
5
+
6
+ module LLT
7
+ class Segmenter
8
+ include Constants::Abbreviations
9
+ include Core::Serviceable
10
+
11
+ uses_logger { Logger.new('Segmenter', default: :debug) }
12
+
13
+ def self.default_options
14
+ {
15
+ indexing: true,
16
+ newline_boundary: 2
17
+ }
18
+ end
19
+
20
+ # Abbreviations with boundary e.g. \bA
21
+ #
22
+ # This doesn't work in jruby (opened an issue at jruby/jruby#1269 ),
23
+ # so we have to change things as long as this is not fixed.
24
+ #
25
+ # (?<=\s|^) can be just \b in MRI 2.0 and upwards
26
+ AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^)#{abbr}" }.join('|')
27
+ SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[;\?!:]/
28
+ DIRECT_SPEECH_DELIMITER = /['"”]/
29
+ TRAILERS = /\)|<\/.*?>/
30
+
31
+ def segment(string, add_to: nil, **options)
32
+ setup(options)
33
+ # dump whitespace at the beginning and end!
34
+ string.strip!
35
+ sentences = scan_through_string(StringScanner.new(string))
36
+ add_to << sentences if add_to.respond_to?(:<<)
37
+ sentences
38
+ end
39
+
40
+ private
41
+
42
+ def setup(options)
43
+ @indexing = parse_option(:indexing, options)
44
+ @id = 0 if @indexing
45
+
46
+ nl_boundary = parse_option(:newline_boundary, options)
47
+ @sentence_closer = Regexp.union(SENTENCE_CLOSER, /\n{#{nl_boundary}}/)
48
+ end
49
+
50
+ def scan_through_string(scanner, sentences = [])
51
+ while scanner.rest?
52
+ sentence = scanner.scan_until(@sentence_closer) ||
53
+ rescue_no_delimiters(sentences, scanner)
54
+ sentence << trailing_delimiters(scanner)
55
+
56
+ sentence.strip!
57
+ unless sentence.empty?
58
+ curr_id = id
59
+ @logger.log("Segmented #{curr_id} #{sentence}")
60
+ sentences << Sentence.new(sentence, curr_id)
61
+ end
62
+ end
63
+ sentences
64
+ end
65
+
66
+ def id
67
+ if @indexing
68
+ @id += 1
69
+ end
70
+ end
71
+
72
+ def rescue_no_delimiters(sentences, scanner)
73
+ if sentences.any?
74
+ # broken off texts
75
+ scanner.scan_until(/$/)
76
+ else
77
+ # try a simple newline as delimiter, if there was no delimiter
78
+ scanner.reset
79
+ @sentence_closer = /\n/
80
+ if sent = scanner.scan_until(@sentence_closer)
81
+ sent
82
+ else
83
+ # when there is not even a new line, return all input
84
+ scanner.terminate
85
+ scanner.string
86
+ end
87
+ end
88
+ end
89
+
90
+ def trailing_delimiters(scanner)
91
+ trailers = [DIRECT_SPEECH_DELIMITER, TRAILERS]
92
+ trailers.each_with_object('') do |trailer, str|
93
+ str << scanner.scan(trailer).to_s # catches nil
94
+ end
95
+ end
96
+ end
97
+ end
@@ -0,0 +1,10 @@
1
+ require 'llt/core/containable'
2
+
3
+ module LLT
4
+ class Sentence
5
+ include Core::Containable
6
+
7
+ xml_tag 's'
8
+ container_alias :tokens
9
+ end
10
+ end
@@ -0,0 +1,29 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'llt/segmenter/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "llt-segmenter"
8
+ spec.version = LLT::Segmenter::VERSION
9
+ spec.authors = ["Gernot Höflechner, Robert Lichstensteiner, Christof Sirk"]
10
+ spec.email = ["latin.language.toolkit@gmail.com"]
11
+ spec.description = %q{Segments text into sentences}
12
+ spec.summary = %q{Segments text into sentences}
13
+ spec.homepage = "http://latin-language-toolkit.net"
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files`.split($/)
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_development_dependency "bundler", "~> 1.3"
22
+ spec.add_development_dependency "rake"
23
+ spec.add_development_dependency "rspec"
24
+ spec.add_development_dependency "simplecov", "~> 0.7"
25
+ spec.add_dependency "warbler"
26
+ #spec.add_dependency "llt-core"
27
+ #spec.add_dependency "llt-constants"
28
+ #spec.add_dependency "llt-logger"
29
+ end
@@ -0,0 +1,56 @@
1
+ ENV['RACK_ENV'] = 'test'
2
+
3
+ require 'spec_helper'
4
+ require 'llt/segmenter/api'
5
+ require 'rack/test'
6
+
7
+ def app
8
+ Api
9
+ end
10
+
11
+ describe "segmenter api" do
12
+ include Rack::Test::Methods
13
+
14
+ describe '/segment' do
15
+ context "with URI as input" do
16
+ end
17
+
18
+ let(:text) {{text: "homo mittit. Marcus est."}}
19
+
20
+ context "with text as input" do
21
+ context "with accept header json" do
22
+ it "segments the given sentences" do
23
+ pending
24
+ get '/segment', text,
25
+ {"HTTP_ACCEPT" => "application/json"}
26
+ last_response.should be_ok
27
+ response = last_response.body
28
+ parsed_response = JSON.parse(response)
29
+ parsed_response.should have(2).items
30
+ end
31
+ end
32
+
33
+ context "with accept header xml" do
34
+ it "segments the given sentences" do
35
+ get '/segment', text,
36
+ {"HTTP_ACCEPT" => "application/xml"}
37
+ last_response.should be_ok
38
+ body = last_response.body
39
+ body.should =~ /<s n="1">homo mittit\.<\/s>/
40
+ body.should =~ /<s n="2">Marcus est\.<\/s>/
41
+ end
42
+
43
+ it "receives params for segmentation and markup" do
44
+ params = { indexing: false }.merge(text)
45
+
46
+ get '/segment', params,
47
+ {"HTTP_ACCEPT" => "application/xml"}
48
+ last_response.should be_ok
49
+ body = last_response.body
50
+ body.should =~ /<s>homo mittit\.<\/s>/
51
+ body.should =~ /<s>Marcus est\.<\/s>/
52
+ end
53
+ end
54
+ end
55
+ end
56
+ end
@@ -0,0 +1,259 @@
1
+ require 'spec_helper'
2
+
3
+ describe LLT::Segmenter do
4
+ let(:segmenter) { LLT::Segmenter.new }
5
+ describe "#segment" do
6
+ it "returns an array of LLT::Sentence elements" do
7
+ sentences = segmenter.segment("est.")
8
+ sentences.should have(1).item
9
+ sentences.first.should be_a LLT::Sentence
10
+ end
11
+
12
+ it "segments a paragraph of into sentences - easy" do
13
+ txt = "Cicero est. Caesar est."
14
+ sentences = segmenter.segment(txt)
15
+ sentences.should have(2).items
16
+ sentences[0].to_s.should == "Cicero est."
17
+ sentences[1].to_s.should == "Caesar est."
18
+ end
19
+
20
+ it "segments a paragraph of into sentences - complex" do
21
+ txt = "Cicero est; quis Caesar est? Marcus Antonius!"
22
+ sentences = segmenter.segment(txt)
23
+ sentences.should have(3).items
24
+ sentences[0].to_s.should == "Cicero est;"
25
+ sentences[1].to_s.should == "quis Caesar est?"
26
+ sentences[2].to_s.should == "Marcus Antonius!"
27
+ end
28
+
29
+ it "creates indices by default" do
30
+ txt = "Cicero est; quis Caesar est? Marcus Antonius!"
31
+ sentences = segmenter.segment(txt)
32
+ sentences.map(&:id).should == [1, 2, 3]
33
+ end
34
+
35
+ it "indices can be turned off" do
36
+ txt = "Cicero est; quis Caesar est? Marcus Antonius!"
37
+ sentences = segmenter.segment(txt, indexing: false)
38
+ sentences.map(&:id).should == [nil, nil, nil]
39
+ end
40
+
41
+ it "handles abbreviated names" do
42
+ txt = "C. Caesar est. M. Tullius Cicero est."
43
+ sentences = segmenter.segment(txt)
44
+ sentences.should have(2).items
45
+ sentences[0].to_s.should == "C. Caesar est."
46
+ sentences[1].to_s.should == "M. Tullius Cicero est."
47
+ end
48
+
49
+ it "handles abbreviated dates" do
50
+ txt = "Is dies erat a. d. V Kal. Apr. L. Pisone, A. Gabinio consulibus."
51
+ sentences = segmenter.segment(txt)
52
+ sentences.should have(1).item
53
+ end
54
+
55
+ it "handles more dates" do
56
+ txt = "Is dies erat a. d. V Ian. Non. Feb. L. App. Pisone ."
57
+ sentences = segmenter.segment(txt)
58
+ puts sentences
59
+ sentences.should have(1).item
60
+ end
61
+
62
+ it "are only triggered when they have a leading word boundary" do
63
+       # this spec might seem strange, but this case was broken from the very start
64
+ txt = "erat nauta. est."
65
+ sentences = segmenter.segment(txt)
66
+ sentences.should have(2).items
67
+ end
68
+
69
+ it "handles dates even with numbers that have an abbr dot" do
70
+ pending('Not solved yet. Think of M.') do
71
+ txt = "Is dies erat a. d. V. Kal. Apr. L. Pisone, A. Gabinio consulibus."
72
+ sentences = segmenter.segment(txt)
73
+ sentences.should have(1).item
74
+ end
75
+ end
76
+
77
+ it "splits at :" do
78
+ txt = 'iubent: fugere manus.'
79
+ sentences = segmenter.segment(txt)
80
+ sentences.should have(2).items
81
+ end
82
+
83
+ it "doesn't create empty sentences" do
84
+ txt = "text.\n\n\ntext."
85
+ sentences = segmenter.segment(txt)
86
+ sentences.should have(2).items
87
+ end
88
+
89
+ context "with embedded xml" do
90
+ it "doesn't break up before xml closing tags" do
91
+ txt = '<grc> text.</grc>'
92
+ sentences = segmenter.segment(txt)
93
+ sentences.should have(1).item
94
+ end
95
+ end
96
+
97
+ context "newline (\\n) handling" do
98
+ it "works when in between" do
99
+ txt = "Filia est.\nFilius est."
100
+ sentences = segmenter.segment(txt)
101
+ sentences.should have(2).items
102
+ sentences[0].to_s.should == "Filia est."
103
+ sentences[1].to_s.should == "Filius est."
104
+ end
105
+
106
+ it "works when at the end of a text" do
107
+ sentences = segmenter.segment("Marcus est.\n")
108
+ sentences.should have(1).item
109
+ sentences.first.to_s.should == 'Marcus est.'
110
+ end
111
+
112
+ it "works with newline and space in between and no new line at the end" do
113
+ txt = "Fīlius rēgīnae erat.\n Rēgīnam aurō dōnābunt."
114
+ sentences = segmenter.segment(txt)
115
+ sentences.should have(2).items
116
+ sentences[0].to_s.should == "Fīlius rēgīnae erat."
117
+ sentences[1].to_s.should == "Rēgīnam aurō dōnābunt."
118
+ end
119
+
120
+ it "works with newline and space in between and new line at the end" do
121
+ txt = "Fīlius rēgīnae erat nauta.\n Rēgīnam aurō dōnābunt.\n"
122
+ sentences = segmenter.segment(txt)
123
+ sentences.should have(2).items
124
+ sentences[0].to_s.should == "Fīlius rēgīnae erat nauta."
125
+ sentences[1].to_s.should == "Rēgīnam aurō dōnābunt."
126
+ end
127
+
128
+ it "treats an empty line as delimiter - might e.g. appear in book titles" do
129
+ txt = "Marcus est\n\nMarcus est."
130
+ sentences = segmenter.segment(txt)
131
+ sentences.should have(2).item
132
+ end
133
+
134
+ it "number of newlines that count as sentence boundary can be given as option" do
135
+ txt1 = "Marcus est\n\nMarcus est."
136
+ txt2 = "Marcus est\n\n\nMarcus est."
137
+ sentences1 = segmenter.segment(txt1, newline_boundary: 3)
138
+ sentences2 = segmenter.segment(txt2, newline_boundary: 3)
139
+ sentences1.should have(1).item
140
+ sentences2.should have(2).item
141
+ end
142
+ end
143
+
144
+ it "handles quantified texts" do
145
+ txt = "Fēmina puellae pecūniam dabat.\n Fīlia poētae in viīs errābat.\n"
146
+ sentences = segmenter.segment(txt)
147
+ sentences.should have(2).item
148
+ end
149
+
150
+ it "is not disturbed by leading or trailing whitespace" do
151
+ txt = ' Marcus est. Marcus est. '
152
+ sentences = segmenter.segment(txt)
153
+ sentences.should have(2).item
154
+ end
155
+
156
+ context "with ellipsis punctuation" do
157
+ it "handles them at the end of a sentence" do
158
+ txt = 'Marcus ...'
159
+ sentences = segmenter.segment(txt)
160
+ sentences.should have(1).item
161
+ end
162
+
163
+ it "handles them in the midst of a sentence" do
164
+ pending 'Tough to do'
165
+ end
166
+ end
167
+
168
+ context "direct speech delimiter" do
169
+ context "with '" do
170
+ it "handles basic cases when on the outside of the punctuation" do
171
+ txt = "'Marcus est.'"
172
+ sentences = segmenter.segment(txt)
173
+ sentences.should have(1).item
174
+ end
175
+
176
+ it "handles basic cases when on the inside of the punctuation" do
177
+ txt = "'Marcus est'?"
178
+ sentences = segmenter.segment(txt)
179
+ sentences.should have(1).item
180
+ end
181
+ end
182
+
183
+ context 'with "' do
184
+ it "handles basic cases when on the outside of the punctuation" do
185
+ txt = '"Marcus est."'
186
+ sentences = segmenter.segment(txt)
187
+ sentences.should have(1).item
188
+ end
189
+
190
+ it "handles basic cases when on the inside of the punctuation" do
191
+ txt = '"Marcus est"?'
192
+ sentences = segmenter.segment(txt)
193
+ sentences.should have(1).item
194
+ end
195
+ end
196
+
197
+ context 'with ” (attention: this is NOT the same as "' do
198
+ it "handles basic cases when on the outside of the punctuation" do
199
+ txt = '”Marcus est.”'
200
+ sentences = segmenter.segment(txt)
201
+ sentences.should have(1).item
202
+ end
203
+
204
+ it "handles basic cases when on the inside of the punctuation" do
205
+ txt = '”Marcus est”?'
206
+ sentences = segmenter.segment(txt)
207
+ sentences.should have(1).item
208
+ end
209
+ end
210
+ end
211
+
212
+ it "catches trailing parenthesis" do
213
+ txt = "Marcus est. (Marcus est.) Marcus est."
214
+ sentences = segmenter.segment(txt)
215
+ sentences.should have(3).items
216
+ sentences[0].to_s.should == 'Marcus est.'
217
+ sentences[1].to_s.should == '(Marcus est.)'
218
+ sentences[2].to_s.should == 'Marcus est.'
219
+ end
220
+
221
+ it "handles broken off texts - the rest is an own sentence" do
222
+ txt = "Marcus est. Marcus est"
223
+ sentences = segmenter.segment(txt)
224
+ sentences.should have(2).item
225
+ end
226
+
227
+ context "with no delimiters present" do
228
+ it "tries to fallback to single newline boundary" do
229
+ txt = "Marcus est\nMarcus est"
230
+ segmenter.segment(txt).should have(2).items
231
+ end
232
+
233
+ it "returns the whole input as segment when there are no newlines" do
234
+ txt = "Marcus est"
235
+ segmenter.segment(txt).should have(1).item
236
+ end
237
+ end
238
+
239
+ describe "takes an optional keyword argument add_to" do
240
+ class ParagraphDummy
241
+ attr_reader :sentences
242
+ def initialize; @sentences = []; end
243
+ def <<(sentences); @sentences += sentences; end
244
+ end
245
+
246
+ it "adds the result to the given object if #<< is implemented" do
247
+ paragraph = ParagraphDummy.new
248
+ s = segmenter.segment("", add_to: paragraph)
249
+ paragraph.sentences.should == s
250
+ end
251
+
252
+ it "does nothing to the given object when #<< it does not respond to" do
253
+ object = double(respond_to?: false)
254
+ object.should_not receive(:<<)
255
+ segmenter.segment("", add_to: object)
256
+ end
257
+ end
258
+ end
259
+ end
@@ -0,0 +1,26 @@
1
+ require 'simplecov'
2
+ require 'coveralls'
3
+
4
+ Coveralls.wear!
5
+
6
+ SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
7
+ SimpleCov::Formatter::HTMLFormatter,
8
+ Coveralls::SimpleCov::Formatter
9
+ ]
10
+
11
+ SimpleCov.start do
12
+ add_filter '/spec/'
13
+ end
14
+
15
+ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
16
+ require 'llt/segmenter'
17
+
18
+ if defined?(LLT::Logger)
19
+ LLT::Logger.level = nil
20
+ end
21
+
22
+ RSpec.configure do |config|
23
+ config.treat_symbols_as_metadata_keys_with_true_values = true
24
+ config.run_all_when_everything_filtered = true
25
+ config.filter_run :focus
26
+ end
metadata ADDED
@@ -0,0 +1,134 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: llt-segmenter
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-12-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: simplecov
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '0.7'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '0.7'
69
+ - !ruby/object:Gem::Dependency
70
+ name: warbler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: Segments text into sentences
84
+ email:
85
+ - latin.language.toolkit@gmail.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - ".gitignore"
91
+ - ".rspec"
92
+ - ".travis.yml"
93
+ - Gemfile
94
+ - LICENSE
95
+ - README.md
96
+ - Rakefile
97
+ - config.ru
98
+ - config/warble.rb
99
+ - lib/llt/segmenter.rb
100
+ - lib/llt/segmenter/api.rb
101
+ - lib/llt/segmenter/version.rb
102
+ - lib/llt/sentence.rb
103
+ - llt-segmenter.gemspec
104
+ - spec/lib/llt/segmenter/api_spec.rb
105
+ - spec/lib/llt/segmenter_spec.rb
106
+ - spec/spec_helper.rb
107
+ homepage: http://latin-language-toolkit.net
108
+ licenses:
109
+ - MIT
110
+ metadata: {}
111
+ post_install_message:
112
+ rdoc_options: []
113
+ require_paths:
114
+ - lib
115
+ required_ruby_version: !ruby/object:Gem::Requirement
116
+ requirements:
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ version: '0'
120
+ required_rubygems_version: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ requirements: []
126
+ rubyforge_project:
127
+ rubygems_version: 2.1.5
128
+ signing_key:
129
+ specification_version: 4
130
+ summary: Segments text into sentences
131
+ test_files:
132
+ - spec/lib/llt/segmenter/api_spec.rb
133
+ - spec/lib/llt/segmenter_spec.rb
134
+ - spec/spec_helper.rb