llt-segmenter 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +3 -0
- data/.travis.yml +6 -0
- data/Gemfile +15 -0
- data/LICENSE +20 -0
- data/README.md +45 -0
- data/Rakefile +7 -0
- data/config/warble.rb +162 -0
- data/config.ru +2 -0
- data/lib/llt/segmenter/api.rb +20 -0
- data/lib/llt/segmenter/version.rb +5 -0
- data/lib/llt/segmenter.rb +97 -0
- data/lib/llt/sentence.rb +10 -0
- data/llt-segmenter.gemspec +29 -0
- data/spec/lib/llt/segmenter/api_spec.rb +56 -0
- data/spec/lib/llt/segmenter_spec.rb +259 -0
- data/spec/spec_helper.rb +26 -0
- metadata +134 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 4759c4666499718fd9364ff782a16f5d7ef818d1
|
4
|
+
data.tar.gz: cb27484995b057c79017ac11740dc68284f0085b
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: abb24d73dc6029e91bb5ba14d81fcd0244ec7cc1619d7848bddff4df785c38dcd1c069877e368930289a248e3e79a905a3d4771e7b7fa6b7087c33427c04d54d
|
7
|
+
data.tar.gz: dd8daa449bb083a62ebd6656c6c6221e2af9c36cb0d22fdd95775b8e781f312125857380abc43a854c6e282cda797f77331afe65f0fd5b2f04d6d5e4d417b389
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
|
3
|
+
# Specify your gem's dependencies in llt-segmenter.gemspec
|
4
|
+
gemspec
|
5
|
+
gem 'pry'
|
6
|
+
|
7
|
+
gem 'coveralls', require: false
|
8
|
+
|
9
|
+
gem 'llt-core', git: 'https://github.com/latin-language-toolkit/llt-core.git'
|
10
|
+
gem 'llt-constants', git: 'https://github.com/latin-language-toolkit/llt-constants.git'
|
11
|
+
gem 'llt-logger', git: 'https://github.com/latin-language-toolkit/llt-logger.git'
|
12
|
+
|
13
|
+
platform :jruby do
|
14
|
+
gem 'jruby-httpclient'
|
15
|
+
end
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2013 latin-language-toolkit
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
6
|
+
this software and associated documentation files (the "Software"), to deal in
|
7
|
+
the Software without restriction, including without limitation the rights to
|
8
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
9
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
10
|
+
subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
17
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
18
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
19
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
20
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# LLT::Segmenter
|
2
|
+
|
3
|
+
[![Version](http://allthebadges.io/latin-language-toolkit/llt-segmenter/badge_fury.png)](http://allthebadges.io/latin-language-toolkit/llt-segmenter/badge_fury)
|
4
|
+
[![Build Status](https://travis-ci.org/latin-language-toolkit/llt-segmenter.png?branch=master)](https://travis-ci.org/latin-language-toolkit/llt-segmenter)
|
5
|
+
[![Dependencies](http://allthebadges.io/latin-language-toolkit/llt-segmenter/gemnasium.png)](http://allthebadges.io/latin-language-toolkit/llt-segmenter/gemnasium)
|
6
|
+
[![Coverage](https://coveralls.io/repos/latin-language-toolkit/llt-segmenter/badge.png?branch=master)](https://coveralls.io/r/latin-language-toolkit/llt-segmenter?branch=master)
|
7
|
+
[![Code Climate](https://codeclimate.com/github/latin-language-toolkit/llt-segmenter.png)](https://codeclimate.com/github/latin-language-toolkit/llt-segmenter)
|
8
|
+
|
9
|
+
Segments text into sentences.
|
10
|
+
|
11
|
+
## Installation
|
12
|
+
|
13
|
+
Add this line to your application's Gemfile:
|
14
|
+
|
15
|
+
gem 'llt-segmenter'
|
16
|
+
|
17
|
+
And then execute:
|
18
|
+
|
19
|
+
$ bundle
|
20
|
+
|
21
|
+
Or install it yourself as:
|
22
|
+
|
23
|
+
$ gem install llt-segmenter
|
24
|
+
|
25
|
+
## Usage
|
26
|
+
|
27
|
+
TODO: Write usage instructions here
|
28
|
+
|
29
|
+
## API
|
30
|
+
This is currently a list of requirements and will evolve into the API documentation.
|
31
|
+
|
32
|
+
Input:
|
33
|
+
- Text (or URI)
|
34
|
+
- Black-/Whitelist for separators.
|
35
|
+
|
36
|
+
Output:
|
37
|
+
- XML (TEI) or JSON
|
38
|
+
|
39
|
+
## Contributing
|
40
|
+
|
41
|
+
1. Fork it
|
42
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
43
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
44
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
45
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/config/warble.rb
ADDED
@@ -0,0 +1,162 @@
|
|
1
|
+
# Disable Rake-environment-task framework detection by uncommenting/setting to false
|
2
|
+
# Warbler.framework_detection = false
|
3
|
+
|
4
|
+
# Warbler web application assembly configuration file
|
5
|
+
Warbler::Config.new do |config|
|
6
|
+
# Features: additional options controlling how the jar is built.
|
7
|
+
# Currently the following features are supported:
|
8
|
+
# - gemjar: package the gem repository in a jar file in WEB-INF/lib
|
9
|
+
# - executable: embed a web server and make the war executable
|
10
|
+
# - compiled: compile .rb files to .class files
|
11
|
+
# config.features = %w(gemjar)
|
12
|
+
|
13
|
+
# Application directories to be included in the webapp.
|
14
|
+
# config.dirs = %w(app config db lib log script vendor tmp)
|
15
|
+
|
16
|
+
# Additional files/directories to include, above those in config.dirs
|
17
|
+
# config.includes = FileList["db"]
|
18
|
+
|
19
|
+
# Additional files/directories to exclude
|
20
|
+
# config.excludes = FileList["lib/tasks/*"]
|
21
|
+
|
22
|
+
# Additional Java .jar files to include. Note that if .jar files are placed
|
23
|
+
# in lib (and not otherwise excluded) then they need not be mentioned here.
|
24
|
+
# JRuby and JRuby-Rack are pre-loaded in this list. Be sure to include your
|
25
|
+
# own versions if you directly set the value
|
26
|
+
# config.java_libs += FileList["lib/java/*.jar"]
|
27
|
+
|
28
|
+
# Loose Java classes and miscellaneous files to be included.
|
29
|
+
# config.java_classes = FileList["target/classes/**.*"]
|
30
|
+
|
31
|
+
# One or more pathmaps defining how the java classes should be copied into
|
32
|
+
# the archive. The example pathmap below accompanies the java_classes
|
33
|
+
# configuration above. See http://rake.rubyforge.org/classes/String.html#M000017
|
34
|
+
# for details of how to specify a pathmap.
|
35
|
+
# config.pathmaps.java_classes << "%{target/classes/,}p"
|
36
|
+
|
37
|
+
# Bundler support is built-in. If Warbler finds a Gemfile in the
|
38
|
+
# project directory, it will be used to collect the gems to bundle
|
39
|
+
# in your application. If you wish to explicitly disable this
|
40
|
+
# functionality, uncomment here.
|
41
|
+
# config.bundler = false
|
42
|
+
|
43
|
+
# An array of Bundler groups to avoid including in the war file.
|
44
|
+
# Defaults to ["development", "test", "assets"].
|
45
|
+
# config.bundle_without = []
|
46
|
+
|
47
|
+
# Other gems to be included. If you don't use Bundler or a gemspec
|
48
|
+
# file, you need to tell Warbler which gems your application needs
|
49
|
+
# so that they can be packaged in the archive.
|
50
|
+
# For Rails applications, the Rails gems are included by default
|
51
|
+
# unless the vendor/rails directory is present.
|
52
|
+
# config.gems += ["activerecord-jdbcmysql-adapter", "jruby-openssl"]
|
53
|
+
# config.gems << "tzinfo"
|
54
|
+
|
55
|
+
# Uncomment this if you don't want to package rails gem.
|
56
|
+
# config.gems -= ["rails"]
|
57
|
+
|
58
|
+
# The most recent versions of gems are used.
|
59
|
+
# You can specify versions of gems by using a hash assignment:
|
60
|
+
# config.gems["rails"] = "2.3.10"
|
61
|
+
|
62
|
+
# You can also use regexps or Gem::Dependency objects for flexibility or
|
63
|
+
# finer-grained control.
|
64
|
+
# config.gems << /^merb-/
|
65
|
+
# config.gems << Gem::Dependency.new("merb-core", "= 0.9.3")
|
66
|
+
|
67
|
+
# Include gem dependencies not mentioned specifically. Default is
|
68
|
+
# true, uncomment to turn off.
|
69
|
+
# config.gem_dependencies = false
|
70
|
+
|
71
|
+
# Array of regular expressions matching relative paths in gems to be
|
72
|
+
# excluded from the war. Defaults to empty, but you can set it like
|
73
|
+
# below, which excludes test files.
|
74
|
+
# config.gem_excludes = [/^(test|spec)\//]
|
75
|
+
|
76
|
+
# Pathmaps for controlling how application files are copied into the archive
|
77
|
+
# config.pathmaps.application = ["WEB-INF/%p"]
|
78
|
+
|
79
|
+
# Name of the archive (without the extension). Defaults to the basename
|
80
|
+
# of the project directory.
|
81
|
+
# config.jar_name = "mywar"
|
82
|
+
|
83
|
+
# Name of the MANIFEST.MF template for the war file. Defaults to a simple
|
84
|
+
# MANIFEST.MF that contains the version of Warbler used to create the war file.
|
85
|
+
# config.manifest_file = "config/MANIFEST.MF"
|
86
|
+
|
87
|
+
# When using the 'compiled' feature and specified, only these Ruby
|
88
|
+
# files will be compiled. Default is to compile all \.rb files in
|
89
|
+
# the application.
|
90
|
+
# config.compiled_ruby_files = FileList['app/**/*.rb']
|
91
|
+
|
92
|
+
# When set to true, Warbler will override the value of ENV['GEM_HOME'] even if it
|
93
|
+
# has already been set. When set to false it will use any existing value of
|
94
|
+
# GEM_HOME if it is set.
|
95
|
+
# config.override_gem_home = true
|
96
|
+
|
97
|
+
# Allows for specifying custom executables
|
98
|
+
# config.executable = ["rake", "bin/rake"]
|
99
|
+
|
100
|
+
# Sets default (prefixed) parameters for the executables
|
101
|
+
# config.executable_params = "do:something"
|
102
|
+
|
103
|
+
# === War files only below here ===
|
104
|
+
|
105
|
+
# Path to the pre-bundled gem directory inside the war file. Default
|
106
|
+
# is 'WEB-INF/gems'. Specify path if gems are already bundled
|
107
|
+
# before running Warbler. This also sets 'gem.path' inside web.xml.
|
108
|
+
# config.gem_path = "WEB-INF/vendor/bundler_gems"
|
109
|
+
|
110
|
+
# Files for WEB-INF directory (next to web.xml). This contains
|
111
|
+
# web.xml by default. If there is an .erb-File it will be processed
|
112
|
+
# with webxml-config. You may want to exclude this file via
|
113
|
+
# config.excludes.
|
114
|
+
# config.webinf_files += FileList["jboss-web.xml"]
|
115
|
+
|
116
|
+
# Files to be included in the root of the webapp. Note that files in public
|
117
|
+
# will have the leading 'public/' part of the path stripped during staging.
|
118
|
+
# config.public_html = FileList["public/**/*", "doc/**/*"]
|
119
|
+
|
120
|
+
# Pathmaps for controlling how public HTML files are copied into the .war
|
121
|
+
# config.pathmaps.public_html = ["%{public/,}p"]
|
122
|
+
|
123
|
+
# Embedded webserver to use with the 'executable' feature. Currently supported
|
124
|
+
# webservers are:
|
125
|
+
# * <tt>winstone</tt> (default) - Winstone 0.9.10 from sourceforge
|
126
|
+
# * <tt>jenkins-ci.winstone</tt> - Improved Winstone from Jenkins CI
|
127
|
+
# * <tt>jetty</tt> - Embedded Jetty from Eclipse
|
128
|
+
# config.webserver = 'jetty'
|
129
|
+
|
130
|
+
# Value of RAILS_ENV for the webapp -- default as shown below
|
131
|
+
# config.webxml.rails.env = ENV['RAILS_ENV'] || 'production'
|
132
|
+
|
133
|
+
# Application booter to use, one of :rack, :rails, or :merb (autodetected by default)
|
134
|
+
# config.webxml.booter = :rails
|
135
|
+
|
136
|
+
# Set JRuby to run in 1.9 mode.
|
137
|
+
# config.webxml.jruby.compat.version = "1.9"
|
138
|
+
|
139
|
+
# When using the :rack booter, "Rackup" script to use.
|
140
|
+
# - For 'rackup.path', the value points to the location of the rackup
|
141
|
+
# script in the web archive file. You need to make sure this file
|
142
|
+
# gets included in the war, possibly by adding it to config.includes
|
143
|
+
# or config.webinf_files above.
|
144
|
+
# - For 'rackup', the rackup script you provide as an inline string
|
145
|
+
# is simply embedded in web.xml.
|
146
|
+
# The script is evaluated in a Rack::Builder to load the application.
|
147
|
+
# Examples:
|
148
|
+
# config.webxml.rackup.path = 'WEB-INF/hello.ru'
|
149
|
+
# config.webxml.rackup = %{require './lib/demo'; run Rack::Adapter::Camping.new(Demo)}
|
150
|
+
# config.webxml.rackup = require 'cgi' && CGI::escapeHTML(File.read("config.ru"))
|
151
|
+
|
152
|
+
# Control the pool of Rails runtimes. Leaving unspecified means
|
153
|
+
# the pool will grow as needed to service requests. It is recommended
|
154
|
+
# that you fix these values when running a production server!
|
155
|
+
# If you're using threadsafe! mode, you probably don't want to set these values,
|
156
|
+
# since 1 runtime(default for threadsafe mode) will be enough.
|
157
|
+
# config.webxml.jruby.min.runtimes = 2
|
158
|
+
# config.webxml.jruby.max.runtimes = 4
|
159
|
+
|
160
|
+
# JNDI data source name
|
161
|
+
# config.webxml.jndi = 'jdbc/rails'
|
162
|
+
end
|
data/config.ru
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'sinatra/base'
|
2
|
+
require 'sinatra/respond_with'
|
3
|
+
require 'llt/segmenter'
|
4
|
+
require 'llt/core/api'
|
5
|
+
|
6
|
+
# Sinatra application that exposes the segmenter over HTTP.
#
# typecast_params!, extract_text and to_xml are not defined in this file;
# presumably they come from LLT::Core::Api::Helpers - confirm there.
class Api < Sinatra::Base
  register Sinatra::RespondWith
  helpers LLT::Core::Api::Helpers

  # GET /segment
  #
  # Segments the text taken from the request parameters and answers with
  # the XML rendering of the resulting sentences. The request parameters
  # are also handed to the segmenter and to the XML serializer.
  get '/segment' do
    typecast_params!(params)
    input  = extract_text(params)
    result = LLT::Segmenter.new(params).segment(input)

    respond_to { |format| format.xml { to_xml(result, params) } }
  end
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
require "llt/constants"
|
2
|
+
require "llt/core"
|
3
|
+
require "llt/logger"
|
4
|
+
require "llt/sentence"
|
5
|
+
|
6
|
+
module LLT
  # Splits a text into sentences.
  #
  # A sentence ends at a period (unless the period belongs to a known
  # abbreviation or is followed by another period), at one of ; ? ! : or at
  # a configurable number of consecutive newlines. Closing quotation marks,
  # closing parentheses and closing XML tags that directly follow a sentence
  # closer are kept with the sentence they close.
  #
  # parse_option, uses_logger and @logger are not defined in this file;
  # presumably they are provided by Core::Serviceable - confirm there.
  class Segmenter
    include Constants::Abbreviations
    include Core::Serviceable

    uses_logger { Logger.new('Segmenter', default: :debug) }

    # Options that apply when the caller does not override them.
    #
    # indexing         - when truthy, sentences receive running ids
    #                    starting at 1; otherwise their id is nil.
    # newline_boundary - number of consecutive newlines that count as a
    #                    sentence boundary.
    def self.default_options
      {
        indexing: true,
        newline_boundary: 2
      }
    end

    # Abbreviations with boundary e.g. \bA
    #
    # This doesn't work in jruby (opened an issue at jruby/jruby#1269),
    # so we have to change things as long as this is not fixed.
    #
    # (?<=\s|^) can be just \b in MRI 2.0 and upwards
    AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^)#{abbr}" }.join('|').freeze
    # A period that neither follows an abbreviation nor precedes another
    # period (ellipsis), or one of ; ? ! :
    SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[;\?!:]/
    DIRECT_SPEECH_DELIMITER = /['"”]/
    # Closing parenthesis or closing XML tag right after a sentence closer.
    TRAILERS = /\)|<\/.*?>/

    # Segments +string+ into sentences.
    #
    # string  - the text to segment. It is not modified; segmentation
    #           works on a copy without leading and trailing whitespace.
    # add_to  - optional object; when it responds to #<<, the resulting
    #           array of sentences is pushed to it.
    # options - overrides for the values in default_options.
    #
    # Returns an Array of Sentence objects.
    def segment(string, add_to: nil, **options)
      setup(options)
      # Use the non-destructive strip: the caller's string must stay
      # untouched and may even be frozen.
      text = string.strip
      sentences = scan_through_string(StringScanner.new(text))
      add_to << sentences if add_to.respond_to?(:<<)
      sentences
    end

    private

    # Resets the per-call state: the id counter (only when indexing) and the
    # regexp that marks the end of a sentence.
    def setup(options)
      @indexing = parse_option(:indexing, options)
      @id = 0 if @indexing

      nl_boundary = parse_option(:newline_boundary, options)
      # NOTE(review): a boundary of 0 would yield a closer that matches the
      # empty string - presumably callers always pass a positive integer.
      @sentence_closer = Regexp.union(SENTENCE_CLOSER, /\n{#{nl_boundary}}/)
    end

    # Walks the scanner to its end and collects every non-empty sentence.
    def scan_through_string(scanner, sentences = [])
      while scanner.rest?
        sentence = scanner.scan_until(@sentence_closer) ||
                   rescue_no_delimiters(sentences, scanner)
        sentence << trailing_delimiters(scanner)

        sentence.strip!
        next if sentence.empty?

        curr_id = id
        @logger.log("Segmented #{curr_id} #{sentence}")
        sentences << Sentence.new(sentence, curr_id)
      end
      sentences
    end

    # Next running sentence id, or nil when indexing is turned off.
    def id
      @id += 1 if @indexing
    end

    # Called when no sentence closer can be found in the rest of the text.
    #
    # With sentences already collected, the text is considered broken off
    # and the whole remainder becomes the last sentence. The remainder has
    # to be consumed completely: scanning only up to an end-of-line anchor
    # yields an empty match in front of a newline, which leaves the scanner
    # where it is and would make the caller loop forever.
    #
    # Without any sentence so far, scanning starts over with a single
    # newline as the sentence closer; if there is no newline either, the
    # complete input is returned.
    def rescue_no_delimiters(sentences, scanner)
      return consume_rest(scanner) if sentences.any?

      scanner.reset
      @sentence_closer = /\n/
      scanner.scan_until(@sentence_closer) || consume_rest(scanner)
    end

    # Returns everything the scanner has not consumed yet as a new string
    # and moves the scanner to the end of its input.
    def consume_rest(scanner)
      rest = scanner.rest
      scanner.terminate
      rest
    end

    # Consumes a direct speech delimiter and then a trailer when they follow
    # directly at the scanner position; returns the consumed text (possibly
    # empty).
    def trailing_delimiters(scanner)
      trailers = [DIRECT_SPEECH_DELIMITER, TRAILERS]
      trailers.each_with_object('') do |trailer, str|
        str << scanner.scan(trailer).to_s # to_s turns a failed scan (nil) into ''
      end
    end
  end
end
|
data/lib/llt/sentence.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'llt/segmenter/version'
|
5
|
+
|
6
|
+
# Gem specification for llt-segmenter.
Gem::Specification.new do |spec|
  spec.name          = "llt-segmenter"
  spec.version       = LLT::Segmenter::VERSION
  # One array entry per author, so that the gem metadata lists three
  # authors instead of a single comma-separated name.
  spec.authors       = ["Gernot Höflechner", "Robert Lichstensteiner", "Christof Sirk"]
  spec.email         = ["latin.language.toolkit@gmail.com"]
  spec.description   = %q{Segments text into sentences}
  spec.summary       = %q{Segments text into sentences}
  spec.homepage      = "http://latin-language-toolkit.net"
  spec.license       = "MIT"

  # Package every file tracked by git; executables and test files are
  # derived from that list by path prefix.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec"
  spec.add_development_dependency "simplecov", "~> 0.7"
  spec.add_dependency "warbler"
  #spec.add_dependency "llt-core"
  #spec.add_dependency "llt-constants"
  #spec.add_dependency "llt-logger"
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
ENV['RACK_ENV'] = 'test'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
require 'llt/segmenter/api'
|
5
|
+
require 'rack/test'
|
6
|
+
|
7
|
+
def app
|
8
|
+
Api
|
9
|
+
end
|
10
|
+
|
11
|
+
describe "segmenter api" do
|
12
|
+
include Rack::Test::Methods
|
13
|
+
|
14
|
+
describe '/segment' do
|
15
|
+
context "with URI as input" do
|
16
|
+
end
|
17
|
+
|
18
|
+
let(:text) {{text: "homo mittit. Marcus est."}}
|
19
|
+
|
20
|
+
context "with text as input" do
|
21
|
+
context "with accept header json" do
|
22
|
+
it "segments the given sentences" do
|
23
|
+
pending
|
24
|
+
get '/segment', text,
|
25
|
+
{"HTTP_ACCEPT" => "application/json"}
|
26
|
+
last_response.should be_ok
|
27
|
+
response = last_response.body
|
28
|
+
parsed_response = JSON.parse(response)
|
29
|
+
parsed_response.should have(2).items
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
context "with accept header xml" do
|
34
|
+
it "segments the given sentences" do
|
35
|
+
get '/segment', text,
|
36
|
+
{"HTTP_ACCEPT" => "application/xml"}
|
37
|
+
last_response.should be_ok
|
38
|
+
body = last_response.body
|
39
|
+
body.should =~ /<s n="1">homo mittit\.<\/s>/
|
40
|
+
body.should =~ /<s n="2">Marcus est\.<\/s>/
|
41
|
+
end
|
42
|
+
|
43
|
+
it "receives params for segmentation and markup" do
|
44
|
+
params = { indexing: false }.merge(text)
|
45
|
+
|
46
|
+
get '/segment', params,
|
47
|
+
{"HTTP_ACCEPT" => "application/xml"}
|
48
|
+
last_response.should be_ok
|
49
|
+
body = last_response.body
|
50
|
+
body.should =~ /<s>homo mittit\.<\/s>/
|
51
|
+
body.should =~ /<s>Marcus est\.<\/s>/
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
@@ -0,0 +1,259 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe LLT::Segmenter do
|
4
|
+
let(:segmenter) { LLT::Segmenter.new }
|
5
|
+
describe "#segment" do
|
6
|
+
it "returns an array of LLT::Sentence elements" do
|
7
|
+
sentences = segmenter.segment("est.")
|
8
|
+
sentences.should have(1).item
|
9
|
+
sentences.first.should be_a LLT::Sentence
|
10
|
+
end
|
11
|
+
|
12
|
+
it "segments a paragraph of into sentences - easy" do
|
13
|
+
txt = "Cicero est. Caesar est."
|
14
|
+
sentences = segmenter.segment(txt)
|
15
|
+
sentences.should have(2).items
|
16
|
+
sentences[0].to_s.should == "Cicero est."
|
17
|
+
sentences[1].to_s.should == "Caesar est."
|
18
|
+
end
|
19
|
+
|
20
|
+
it "segments a paragraph of into sentences - complex" do
|
21
|
+
txt = "Cicero est; quis Caesar est? Marcus Antonius!"
|
22
|
+
sentences = segmenter.segment(txt)
|
23
|
+
sentences.should have(3).items
|
24
|
+
sentences[0].to_s.should == "Cicero est;"
|
25
|
+
sentences[1].to_s.should == "quis Caesar est?"
|
26
|
+
sentences[2].to_s.should == "Marcus Antonius!"
|
27
|
+
end
|
28
|
+
|
29
|
+
it "creates indices by default" do
|
30
|
+
txt = "Cicero est; quis Caesar est? Marcus Antonius!"
|
31
|
+
sentences = segmenter.segment(txt)
|
32
|
+
sentences.map(&:id).should == [1, 2, 3]
|
33
|
+
end
|
34
|
+
|
35
|
+
it "indices can be turned off" do
|
36
|
+
txt = "Cicero est; quis Caesar est? Marcus Antonius!"
|
37
|
+
sentences = segmenter.segment(txt, indexing: false)
|
38
|
+
sentences.map(&:id).should == [nil, nil, nil]
|
39
|
+
end
|
40
|
+
|
41
|
+
it "handles abbreviated names" do
|
42
|
+
txt = "C. Caesar est. M. Tullius Cicero est."
|
43
|
+
sentences = segmenter.segment(txt)
|
44
|
+
sentences.should have(2).items
|
45
|
+
sentences[0].to_s.should == "C. Caesar est."
|
46
|
+
sentences[1].to_s.should == "M. Tullius Cicero est."
|
47
|
+
end
|
48
|
+
|
49
|
+
it "handles abbreviated dates" do
|
50
|
+
txt = "Is dies erat a. d. V Kal. Apr. L. Pisone, A. Gabinio consulibus."
|
51
|
+
sentences = segmenter.segment(txt)
|
52
|
+
sentences.should have(1).item
|
53
|
+
end
|
54
|
+
|
55
|
+
it "handles more dates" do
  # Only the final, detached period is expected to close the sentence.
  txt = "Is dies erat a. d. V Ian. Non. Feb. L. App. Pisone ."
  sentences = segmenter.segment(txt)
  sentences.should have(1).item
end
|
61
|
+
|
62
|
+
it "are only triggered when they have a leading word boundary" do
|
63
|
+
# this spec might seem strange, but the case did not work in the beginning
|
64
|
+
txt = "erat nauta. est."
|
65
|
+
sentences = segmenter.segment(txt)
|
66
|
+
sentences.should have(2).items
|
67
|
+
end
|
68
|
+
|
69
|
+
it "handles dates even with numbers that have an abbr dot" do
|
70
|
+
pending('Not solved yet. Think of M.') do
|
71
|
+
txt = "Is dies erat a. d. V. Kal. Apr. L. Pisone, A. Gabinio consulibus."
|
72
|
+
sentences = segmenter.segment(txt)
|
73
|
+
sentences.should have(1).item
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
it "splits at :" do
|
78
|
+
txt = 'iubent: fugere manus.'
|
79
|
+
sentences = segmenter.segment(txt)
|
80
|
+
sentences.should have(2).items
|
81
|
+
end
|
82
|
+
|
83
|
+
it "doesn't create empty sentences" do
|
84
|
+
txt = "text.\n\n\ntext."
|
85
|
+
sentences = segmenter.segment(txt)
|
86
|
+
sentences.should have(2).items
|
87
|
+
end
|
88
|
+
|
89
|
+
context "with embedded xml" do
|
90
|
+
it "doesn't break up before xml closing tags" do
|
91
|
+
txt = '<grc> text.</grc>'
|
92
|
+
sentences = segmenter.segment(txt)
|
93
|
+
sentences.should have(1).item
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
context "newline (\\n) handling" do
|
98
|
+
it "works when in between" do
|
99
|
+
txt = "Filia est.\nFilius est."
|
100
|
+
sentences = segmenter.segment(txt)
|
101
|
+
sentences.should have(2).items
|
102
|
+
sentences[0].to_s.should == "Filia est."
|
103
|
+
sentences[1].to_s.should == "Filius est."
|
104
|
+
end
|
105
|
+
|
106
|
+
it "works when at the end of a text" do
|
107
|
+
sentences = segmenter.segment("Marcus est.\n")
|
108
|
+
sentences.should have(1).item
|
109
|
+
sentences.first.to_s.should == 'Marcus est.'
|
110
|
+
end
|
111
|
+
|
112
|
+
it "works with newline and space in between and no new line at the end" do
|
113
|
+
txt = "Fīlius rēgīnae erat.\n Rēgīnam aurō dōnābunt."
|
114
|
+
sentences = segmenter.segment(txt)
|
115
|
+
sentences.should have(2).items
|
116
|
+
sentences[0].to_s.should == "Fīlius rēgīnae erat."
|
117
|
+
sentences[1].to_s.should == "Rēgīnam aurō dōnābunt."
|
118
|
+
end
|
119
|
+
|
120
|
+
it "works with newline and space in between and new line at the end" do
|
121
|
+
txt = "Fīlius rēgīnae erat nauta.\n Rēgīnam aurō dōnābunt.\n"
|
122
|
+
sentences = segmenter.segment(txt)
|
123
|
+
sentences.should have(2).items
|
124
|
+
sentences[0].to_s.should == "Fīlius rēgīnae erat nauta."
|
125
|
+
sentences[1].to_s.should == "Rēgīnam aurō dōnābunt."
|
126
|
+
end
|
127
|
+
|
128
|
+
it "treats an empty line as delimiter - might e.g. appear in book titles" do
|
129
|
+
txt = "Marcus est\n\nMarcus est."
|
130
|
+
sentences = segmenter.segment(txt)
|
131
|
+
sentences.should have(2).item
|
132
|
+
end
|
133
|
+
|
134
|
+
it "number of newlines that count as sentence boundary can be given as option" do
|
135
|
+
txt1 = "Marcus est\n\nMarcus est."
|
136
|
+
txt2 = "Marcus est\n\n\nMarcus est."
|
137
|
+
sentences1 = segmenter.segment(txt1, newline_boundary: 3)
|
138
|
+
sentences2 = segmenter.segment(txt2, newline_boundary: 3)
|
139
|
+
sentences1.should have(1).item
|
140
|
+
sentences2.should have(2).item
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
it "handles quantified texts" do
|
145
|
+
txt = "Fēmina puellae pecūniam dabat.\n Fīlia poētae in viīs errābat.\n"
|
146
|
+
sentences = segmenter.segment(txt)
|
147
|
+
sentences.should have(2).item
|
148
|
+
end
|
149
|
+
|
150
|
+
it "is not disturbed by leading or trailing whitespace" do
|
151
|
+
txt = ' Marcus est. Marcus est. '
|
152
|
+
sentences = segmenter.segment(txt)
|
153
|
+
sentences.should have(2).item
|
154
|
+
end
|
155
|
+
|
156
|
+
context "with ellipsis punctuation" do
|
157
|
+
it "handles them at the end of a sentence" do
|
158
|
+
txt = 'Marcus ...'
|
159
|
+
sentences = segmenter.segment(txt)
|
160
|
+
sentences.should have(1).item
|
161
|
+
end
|
162
|
+
|
163
|
+
it "handles them in the midst of a sentence" do
|
164
|
+
pending 'Tough to do'
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
context "direct speech delimiter" do
|
169
|
+
context "with '" do
|
170
|
+
it "handles basic cases when on the outside of the punctuation" do
|
171
|
+
txt = "'Marcus est.'"
|
172
|
+
sentences = segmenter.segment(txt)
|
173
|
+
sentences.should have(1).item
|
174
|
+
end
|
175
|
+
|
176
|
+
it "handles basic cases when on the inside of the punctuation" do
|
177
|
+
txt = "'Marcus est'?"
|
178
|
+
sentences = segmenter.segment(txt)
|
179
|
+
sentences.should have(1).item
|
180
|
+
end
|
181
|
+
end
|
182
|
+
|
183
|
+
context 'with "' do
|
184
|
+
it "handles basic cases when on the outside of the punctuation" do
|
185
|
+
txt = '"Marcus est."'
|
186
|
+
sentences = segmenter.segment(txt)
|
187
|
+
sentences.should have(1).item
|
188
|
+
end
|
189
|
+
|
190
|
+
it "handles basic cases when on the inside of the punctuation" do
|
191
|
+
txt = '"Marcus est"?'
|
192
|
+
sentences = segmenter.segment(txt)
|
193
|
+
sentences.should have(1).item
|
194
|
+
end
|
195
|
+
end
|
196
|
+
|
197
|
+
context 'with ” (attention: this is NOT the same as "' do
|
198
|
+
it "handles basic cases when on the outside of the punctuation" do
|
199
|
+
txt = '”Marcus est.”'
|
200
|
+
sentences = segmenter.segment(txt)
|
201
|
+
sentences.should have(1).item
|
202
|
+
end
|
203
|
+
|
204
|
+
it "handles basic cases when on the inside of the punctuation" do
|
205
|
+
txt = '”Marcus est”?'
|
206
|
+
sentences = segmenter.segment(txt)
|
207
|
+
sentences.should have(1).item
|
208
|
+
end
|
209
|
+
end
|
210
|
+
end
|
211
|
+
|
212
|
+
it "catches trailing parenthesis" do
|
213
|
+
txt = "Marcus est. (Marcus est.) Marcus est."
|
214
|
+
sentences = segmenter.segment(txt)
|
215
|
+
sentences.should have(3).items
|
216
|
+
sentences[0].to_s.should == 'Marcus est.'
|
217
|
+
sentences[1].to_s.should == '(Marcus est.)'
|
218
|
+
sentences[2].to_s.should == 'Marcus est.'
|
219
|
+
end
|
220
|
+
|
221
|
+
it "handles broken off texts - the rest is an own sentence" do
|
222
|
+
txt = "Marcus est. Marcus est"
|
223
|
+
sentences = segmenter.segment(txt)
|
224
|
+
sentences.should have(2).item
|
225
|
+
end
|
226
|
+
|
227
|
+
context "with no delimiters present" do
|
228
|
+
it "tries to fallback to single newline boundary" do
|
229
|
+
txt = "Marcus est\nMarcus est"
|
230
|
+
segmenter.segment(txt).should have(2).items
|
231
|
+
end
|
232
|
+
|
233
|
+
it "returns the whole input as segment when there are no newlines" do
|
234
|
+
txt = "Marcus est"
|
235
|
+
segmenter.segment(txt).should have(1).item
|
236
|
+
end
|
237
|
+
end
|
238
|
+
|
239
|
+
describe "takes an optional keyword argument add_to" do
|
240
|
+
class ParagraphDummy
|
241
|
+
attr_reader :sentences
|
242
|
+
def initialize; @sentences = []; end
|
243
|
+
def <<(sentences); @sentences += sentences; end
|
244
|
+
end
|
245
|
+
|
246
|
+
it "adds the result to the given object if #<< is implemented" do
|
247
|
+
paragraph = ParagraphDummy.new
|
248
|
+
s = segmenter.segment("", add_to: paragraph)
|
249
|
+
paragraph.sentences.should == s
|
250
|
+
end
|
251
|
+
|
252
|
+
it "does nothing to the given object when #<< it does not respond to" do
|
253
|
+
object = double(respond_to?: false)
|
254
|
+
object.should_not receive(:<<)
|
255
|
+
segmenter.segment("", add_to: object)
|
256
|
+
end
|
257
|
+
end
|
258
|
+
end
|
259
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'simplecov'
|
2
|
+
require 'coveralls'
|
3
|
+
|
4
|
+
Coveralls.wear!
|
5
|
+
|
6
|
+
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
|
7
|
+
SimpleCov::Formatter::HTMLFormatter,
|
8
|
+
Coveralls::SimpleCov::Formatter
|
9
|
+
]
|
10
|
+
|
11
|
+
SimpleCov.start do
|
12
|
+
add_filter '/spec/'
|
13
|
+
end
|
14
|
+
|
15
|
+
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
16
|
+
require 'llt/segmenter'
|
17
|
+
|
18
|
+
if defined?(LLT::Logger)
|
19
|
+
LLT::Logger.level = nil
|
20
|
+
end
|
21
|
+
|
22
|
+
RSpec.configure do |config|
|
23
|
+
config.treat_symbols_as_metadata_keys_with_true_values = true
|
24
|
+
config.run_all_when_everything_filtered = true
|
25
|
+
config.filter_run :focus
|
26
|
+
end
|
metadata
ADDED
@@ -0,0 +1,134 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: llt-segmenter
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-12-08 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.3'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.3'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: simplecov
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0.7'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0.7'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: warbler
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
description: Segments text into sentences
|
84
|
+
email:
|
85
|
+
- latin.language.toolkit@gmail.com
|
86
|
+
executables: []
|
87
|
+
extensions: []
|
88
|
+
extra_rdoc_files: []
|
89
|
+
files:
|
90
|
+
- ".gitignore"
|
91
|
+
- ".rspec"
|
92
|
+
- ".travis.yml"
|
93
|
+
- Gemfile
|
94
|
+
- LICENSE
|
95
|
+
- README.md
|
96
|
+
- Rakefile
|
97
|
+
- config.ru
|
98
|
+
- config/warble.rb
|
99
|
+
- lib/llt/segmenter.rb
|
100
|
+
- lib/llt/segmenter/api.rb
|
101
|
+
- lib/llt/segmenter/version.rb
|
102
|
+
- lib/llt/sentence.rb
|
103
|
+
- llt-segmenter.gemspec
|
104
|
+
- spec/lib/llt/segmenter/api_spec.rb
|
105
|
+
- spec/lib/llt/segmenter_spec.rb
|
106
|
+
- spec/spec_helper.rb
|
107
|
+
homepage: http://latin-language-toolkit.net
|
108
|
+
licenses:
|
109
|
+
- MIT
|
110
|
+
metadata: {}
|
111
|
+
post_install_message:
|
112
|
+
rdoc_options: []
|
113
|
+
require_paths:
|
114
|
+
- lib
|
115
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
116
|
+
requirements:
|
117
|
+
- - ">="
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: '0'
|
120
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
requirements: []
|
126
|
+
rubyforge_project:
|
127
|
+
rubygems_version: 2.1.5
|
128
|
+
signing_key:
|
129
|
+
specification_version: 4
|
130
|
+
summary: Segments text into sentences
|
131
|
+
test_files:
|
132
|
+
- spec/lib/llt/segmenter/api_spec.rb
|
133
|
+
- spec/lib/llt/segmenter_spec.rb
|
134
|
+
- spec/spec_helper.rb
|