llt-segmenter 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +3 -0
- data/.travis.yml +6 -0
- data/Gemfile +15 -0
- data/LICENSE +20 -0
- data/README.md +45 -0
- data/Rakefile +7 -0
- data/config/warble.rb +162 -0
- data/config.ru +2 -0
- data/lib/llt/segmenter/api.rb +20 -0
- data/lib/llt/segmenter/version.rb +5 -0
- data/lib/llt/segmenter.rb +97 -0
- data/lib/llt/sentence.rb +10 -0
- data/llt-segmenter.gemspec +29 -0
- data/spec/lib/llt/segmenter/api_spec.rb +56 -0
- data/spec/lib/llt/segmenter_spec.rb +259 -0
- data/spec/spec_helper.rb +26 -0
- metadata +134 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 4759c4666499718fd9364ff782a16f5d7ef818d1
|
4
|
+
data.tar.gz: cb27484995b057c79017ac11740dc68284f0085b
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: abb24d73dc6029e91bb5ba14d81fcd0244ec7cc1619d7848bddff4df785c38dcd1c069877e368930289a248e3e79a905a3d4771e7b7fa6b7087c33427c04d54d
|
7
|
+
data.tar.gz: dd8daa449bb083a62ebd6656c6c6221e2af9c36cb0d22fdd95775b8e781f312125857380abc43a854c6e282cda797f77331afe65f0fd5b2f04d6d5e4d417b389
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
|
3
|
+
# Specify your gem's dependencies in llt-segmenter.gemspec
|
4
|
+
gemspec
|
5
|
+
gem 'pry'
|
6
|
+
|
7
|
+
gem 'coveralls', require: false
|
8
|
+
|
9
|
+
gem 'llt-core', git: 'https://github.com/latin-language-toolkit/llt-core.git'
|
10
|
+
gem 'llt-constants', git: 'https://github.com/latin-language-toolkit/llt-constants.git'
|
11
|
+
gem 'llt-logger', git: 'https://github.com/latin-language-toolkit/llt-logger.git'
|
12
|
+
|
13
|
+
platform :jruby do
|
14
|
+
gem 'jruby-httpclient'
|
15
|
+
end
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2013 latin-language-toolkit
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
6
|
+
this software and associated documentation files (the "Software"), to deal in
|
7
|
+
the Software without restriction, including without limitation the rights to
|
8
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
9
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
10
|
+
subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
17
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
18
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
19
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
20
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# LLT::Segmenter
|
2
|
+
|
3
|
+
[](http://allthebadges.io/latin-language-toolkit/llt-segmenter/badge_fury)
|
4
|
+
[](https://travis-ci.org/latin-language-toolkit/llt-segmenter)
|
5
|
+
[](http://allthebadges.io/latin-language-toolkit/llt-segmenter/gemnasium)
|
6
|
+
[](https://coveralls.io/r/latin-language-toolkit/llt-segmenter?branch=master)
|
7
|
+
[](https://codeclimate.com/github/latin-language-toolkit/llt-segmenter)
|
8
|
+
|
9
|
+
Segments text into sentences.
|
10
|
+
|
11
|
+
## Installation
|
12
|
+
|
13
|
+
Add this line to your application's Gemfile:
|
14
|
+
|
15
|
+
gem 'llt-segmenter'
|
16
|
+
|
17
|
+
And then execute:
|
18
|
+
|
19
|
+
$ bundle
|
20
|
+
|
21
|
+
Or install it yourself as:
|
22
|
+
|
23
|
+
$ gem install llt-segmenter
|
24
|
+
|
25
|
+
## Usage
|
26
|
+
|
27
|
+
TODO: Write usage instructions here
|
28
|
+
|
29
|
+
## API
|
30
|
+
This is currently a list of requirements and will evolve into API documentation.
|
31
|
+
|
32
|
+
Input:
|
33
|
+
- Text (or URI)
|
34
|
+
- Black-/Whitelist for separators.
|
35
|
+
|
36
|
+
Output:
|
37
|
+
- XML (TEI) or JSON
|
38
|
+
|
39
|
+
## Contributing
|
40
|
+
|
41
|
+
1. Fork it
|
42
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
43
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
44
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
45
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/config/warble.rb
ADDED
@@ -0,0 +1,162 @@
|
|
1
|
+
# Disable Rake-environment-task framework detection by uncommenting/setting to false
|
2
|
+
# Warbler.framework_detection = false
|
3
|
+
|
4
|
+
# Warbler web application assembly configuration file
|
5
|
+
Warbler::Config.new do |config|
|
6
|
+
# Features: additional options controlling how the jar is built.
|
7
|
+
# Currently the following features are supported:
|
8
|
+
# - gemjar: package the gem repository in a jar file in WEB-INF/lib
|
9
|
+
# - executable: embed a web server and make the war executable
|
10
|
+
# - compiled: compile .rb files to .class files
|
11
|
+
# config.features = %w(gemjar)
|
12
|
+
|
13
|
+
# Application directories to be included in the webapp.
|
14
|
+
# config.dirs = %w(app config db lib log script vendor tmp)
|
15
|
+
|
16
|
+
# Additional files/directories to include, above those in config.dirs
|
17
|
+
# config.includes = FileList["db"]
|
18
|
+
|
19
|
+
# Additional files/directories to exclude
|
20
|
+
# config.excludes = FileList["lib/tasks/*"]
|
21
|
+
|
22
|
+
# Additional Java .jar files to include. Note that if .jar files are placed
|
23
|
+
# in lib (and not otherwise excluded) then they need not be mentioned here.
|
24
|
+
# JRuby and JRuby-Rack are pre-loaded in this list. Be sure to include your
|
25
|
+
# own versions if you directly set the value
|
26
|
+
# config.java_libs += FileList["lib/java/*.jar"]
|
27
|
+
|
28
|
+
# Loose Java classes and miscellaneous files to be included.
|
29
|
+
# config.java_classes = FileList["target/classes/**.*"]
|
30
|
+
|
31
|
+
# One or more pathmaps defining how the java classes should be copied into
|
32
|
+
# the archive. The example pathmap below accompanies the java_classes
|
33
|
+
# configuration above. See http://rake.rubyforge.org/classes/String.html#M000017
|
34
|
+
# for details of how to specify a pathmap.
|
35
|
+
# config.pathmaps.java_classes << "%{target/classes/,}p"
|
36
|
+
|
37
|
+
# Bundler support is built-in. If Warbler finds a Gemfile in the
|
38
|
+
# project directory, it will be used to collect the gems to bundle
|
39
|
+
# in your application. If you wish to explicitly disable this
|
40
|
+
# functionality, uncomment here.
|
41
|
+
# config.bundler = false
|
42
|
+
|
43
|
+
# An array of Bundler groups to avoid including in the war file.
|
44
|
+
# Defaults to ["development", "test", "assets"].
|
45
|
+
# config.bundle_without = []
|
46
|
+
|
47
|
+
# Other gems to be included. If you don't use Bundler or a gemspec
|
48
|
+
# file, you need to tell Warbler which gems your application needs
|
49
|
+
# so that they can be packaged in the archive.
|
50
|
+
# For Rails applications, the Rails gems are included by default
|
51
|
+
# unless the vendor/rails directory is present.
|
52
|
+
# config.gems += ["activerecord-jdbcmysql-adapter", "jruby-openssl"]
|
53
|
+
# config.gems << "tzinfo"
|
54
|
+
|
55
|
+
# Uncomment this if you don't want to package rails gem.
|
56
|
+
# config.gems -= ["rails"]
|
57
|
+
|
58
|
+
# The most recent versions of gems are used.
|
59
|
+
# You can specify versions of gems by using a hash assignment:
|
60
|
+
# config.gems["rails"] = "2.3.10"
|
61
|
+
|
62
|
+
# You can also use regexps or Gem::Dependency objects for flexibility or
|
63
|
+
# finer-grained control.
|
64
|
+
# config.gems << /^merb-/
|
65
|
+
# config.gems << Gem::Dependency.new("merb-core", "= 0.9.3")
|
66
|
+
|
67
|
+
# Include gem dependencies not mentioned specifically. Default is
|
68
|
+
# true, uncomment to turn off.
|
69
|
+
# config.gem_dependencies = false
|
70
|
+
|
71
|
+
# Array of regular expressions matching relative paths in gems to be
|
72
|
+
# excluded from the war. Defaults to empty, but you can set it like
|
73
|
+
# below, which excludes test files.
|
74
|
+
# config.gem_excludes = [/^(test|spec)\//]
|
75
|
+
|
76
|
+
# Pathmaps for controlling how application files are copied into the archive
|
77
|
+
# config.pathmaps.application = ["WEB-INF/%p"]
|
78
|
+
|
79
|
+
# Name of the archive (without the extension). Defaults to the basename
|
80
|
+
# of the project directory.
|
81
|
+
# config.jar_name = "mywar"
|
82
|
+
|
83
|
+
# Name of the MANIFEST.MF template for the war file. Defaults to a simple
|
84
|
+
# MANIFEST.MF that contains the version of Warbler used to create the war file.
|
85
|
+
# config.manifest_file = "config/MANIFEST.MF"
|
86
|
+
|
87
|
+
# When using the 'compiled' feature and specified, only these Ruby
|
88
|
+
# files will be compiled. Default is to compile all \.rb files in
|
89
|
+
# the application.
|
90
|
+
# config.compiled_ruby_files = FileList['app/**/*.rb']
|
91
|
+
|
92
|
+
# When set to true, Warbler will override the value of ENV['GEM_HOME'] even if it
|
93
|
+
# has already been set. When set to false it will use any existing value of
|
94
|
+
# GEM_HOME if it is set.
|
95
|
+
# config.override_gem_home = true
|
96
|
+
|
97
|
+
# Allows for specifying custom executables
|
98
|
+
# config.executable = ["rake", "bin/rake"]
|
99
|
+
|
100
|
+
# Sets default (prefixed) parameters for the executables
|
101
|
+
# config.executable_params = "do:something"
|
102
|
+
|
103
|
+
# === War files only below here ===
|
104
|
+
|
105
|
+
# Path to the pre-bundled gem directory inside the war file. Default
|
106
|
+
# is 'WEB-INF/gems'. Specify path if gems are already bundled
|
107
|
+
# before running Warbler. This also sets 'gem.path' inside web.xml.
|
108
|
+
# config.gem_path = "WEB-INF/vendor/bundler_gems"
|
109
|
+
|
110
|
+
# Files for WEB-INF directory (next to web.xml). This contains
|
111
|
+
# web.xml by default. If there is an .erb-File it will be processed
|
112
|
+
# with webxml-config. You may want to exclude this file via
|
113
|
+
# config.excludes.
|
114
|
+
# config.webinf_files += FileList["jboss-web.xml"]
|
115
|
+
|
116
|
+
# Files to be included in the root of the webapp. Note that files in public
|
117
|
+
# will have the leading 'public/' part of the path stripped during staging.
|
118
|
+
# config.public_html = FileList["public/**/*", "doc/**/*"]
|
119
|
+
|
120
|
+
# Pathmaps for controlling how public HTML files are copied into the .war
|
121
|
+
# config.pathmaps.public_html = ["%{public/,}p"]
|
122
|
+
|
123
|
+
# Embedded webserver to use with the 'executable' feature. Currently supported
|
124
|
+
# webservers are:
|
125
|
+
# * <tt>winstone</tt> (default) - Winstone 0.9.10 from sourceforge
|
126
|
+
# * <tt>jenkins-ci.winstone</tt> - Improved Winstone from Jenkins CI
|
127
|
+
# * <tt>jetty</tt> - Embedded Jetty from Eclipse
|
128
|
+
# config.webserver = 'jetty'
|
129
|
+
|
130
|
+
# Value of RAILS_ENV for the webapp -- default as shown below
|
131
|
+
# config.webxml.rails.env = ENV['RAILS_ENV'] || 'production'
|
132
|
+
|
133
|
+
# Application booter to use, one of :rack, :rails, or :merb (autodetected by default)
|
134
|
+
# config.webxml.booter = :rails
|
135
|
+
|
136
|
+
# Set JRuby to run in 1.9 mode.
|
137
|
+
# config.webxml.jruby.compat.version = "1.9"
|
138
|
+
|
139
|
+
# When using the :rack booter, "Rackup" script to use.
|
140
|
+
# - For 'rackup.path', the value points to the location of the rackup
|
141
|
+
# script in the web archive file. You need to make sure this file
|
142
|
+
# gets included in the war, possibly by adding it to config.includes
|
143
|
+
# or config.webinf_files above.
|
144
|
+
# - For 'rackup', the rackup script you provide as an inline string
|
145
|
+
# is simply embedded in web.xml.
|
146
|
+
# The script is evaluated in a Rack::Builder to load the application.
|
147
|
+
# Examples:
|
148
|
+
# config.webxml.rackup.path = 'WEB-INF/hello.ru'
|
149
|
+
# config.webxml.rackup = %{require './lib/demo'; run Rack::Adapter::Camping.new(Demo)}
|
150
|
+
# config.webxml.rackup = require 'cgi' && CGI::escapeHTML(File.read("config.ru"))
|
151
|
+
|
152
|
+
# Control the pool of Rails runtimes. Leaving unspecified means
|
153
|
+
# the pool will grow as needed to service requests. It is recommended
|
154
|
+
# that you fix these values when running a production server!
|
155
|
+
# If you're using threadsafe! mode, you probably don't want to set these values,
|
156
|
+
# since 1 runtime(default for threadsafe mode) will be enough.
|
157
|
+
# config.webxml.jruby.min.runtimes = 2
|
158
|
+
# config.webxml.jruby.max.runtimes = 4
|
159
|
+
|
160
|
+
# JNDI data source name
|
161
|
+
# config.webxml.jndi = 'jdbc/rails'
|
162
|
+
end
|
data/config.ru
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'sinatra/base'
|
2
|
+
require 'sinatra/respond_with'
|
3
|
+
require 'llt/segmenter'
|
4
|
+
require 'llt/core/api'
|
5
|
+
|
6
|
+
class Api < Sinatra::Base
|
7
|
+
register Sinatra::RespondWith
|
8
|
+
helpers LLT::Core::Api::Helpers
|
9
|
+
|
10
|
+
get '/segment' do
|
11
|
+
typecast_params!(params)
|
12
|
+
text = extract_text(params)
|
13
|
+
segmenter = LLT::Segmenter.new(params)
|
14
|
+
sentences = segmenter.segment(text)
|
15
|
+
|
16
|
+
respond_to do |f|
|
17
|
+
f.xml { to_xml(sentences, params) }
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
require "llt/constants"
|
2
|
+
require "llt/core"
|
3
|
+
require "llt/logger"
|
4
|
+
require "llt/sentence"
|
5
|
+
|
6
|
+
module LLT
|
7
|
+
class Segmenter
|
8
|
+
include Constants::Abbreviations
|
9
|
+
include Core::Serviceable
|
10
|
+
|
11
|
+
uses_logger { Logger.new('Segmenter', default: :debug) }
|
12
|
+
|
13
|
+
def self.default_options
|
14
|
+
{
|
15
|
+
indexing: true,
|
16
|
+
newline_boundary: 2
|
17
|
+
}
|
18
|
+
end
|
19
|
+
|
20
|
+
# Abbreviations with boundary e.g. \bA
|
21
|
+
#
|
22
|
+
# This doesn't work in jruby (opened an issue at jruby/jruby#1269),
|
23
|
+
# so we have to change things as long as this is not fixed.
|
24
|
+
#
|
25
|
+
# (?<=\s|^) can be just \b in MRI 2.0 and upwards
|
26
|
+
AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^)#{abbr}" }.join('|')
|
27
|
+
SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[;\?!:]/
|
28
|
+
DIRECT_SPEECH_DELIMITER = /['"”]/
|
29
|
+
TRAILERS = /\)|<\/.*?>/
|
30
|
+
|
31
|
+
def segment(string, add_to: nil, **options)
|
32
|
+
setup(options)
|
33
|
+
# dump whitespace at the beginning and end!
|
34
|
+
string.strip!
|
35
|
+
sentences = scan_through_string(StringScanner.new(string))
|
36
|
+
add_to << sentences if add_to.respond_to?(:<<)
|
37
|
+
sentences
|
38
|
+
end
|
39
|
+
|
40
|
+
private
|
41
|
+
|
42
|
+
def setup(options)
|
43
|
+
@indexing = parse_option(:indexing, options)
|
44
|
+
@id = 0 if @indexing
|
45
|
+
|
46
|
+
nl_boundary = parse_option(:newline_boundary, options)
|
47
|
+
@sentence_closer = Regexp.union(SENTENCE_CLOSER, /\n{#{nl_boundary}}/)
|
48
|
+
end
|
49
|
+
|
50
|
+
def scan_through_string(scanner, sentences = [])
|
51
|
+
while scanner.rest?
|
52
|
+
sentence = scanner.scan_until(@sentence_closer) ||
|
53
|
+
rescue_no_delimiters(sentences, scanner)
|
54
|
+
sentence << trailing_delimiters(scanner)
|
55
|
+
|
56
|
+
sentence.strip!
|
57
|
+
unless sentence.empty?
|
58
|
+
curr_id = id
|
59
|
+
@logger.log("Segmented #{curr_id} #{sentence}")
|
60
|
+
sentences << Sentence.new(sentence, curr_id)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
sentences
|
64
|
+
end
|
65
|
+
|
66
|
+
def id
|
67
|
+
if @indexing
|
68
|
+
@id += 1
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
def rescue_no_delimiters(sentences, scanner)
|
73
|
+
if sentences.any?
|
74
|
+
# broken off texts
|
75
|
+
scanner.scan_until(/$/)
|
76
|
+
else
|
77
|
+
# try a simple newline as delimiter, if there was no delimiter
|
78
|
+
scanner.reset
|
79
|
+
@sentence_closer = /\n/
|
80
|
+
if sent = scanner.scan_until(@sentence_closer)
|
81
|
+
sent
|
82
|
+
else
|
83
|
+
# when there is not even a new line, return all input
|
84
|
+
scanner.terminate
|
85
|
+
scanner.string
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
def trailing_delimiters(scanner)
|
91
|
+
trailers = [DIRECT_SPEECH_DELIMITER, TRAILERS]
|
92
|
+
trailers.each_with_object('') do |trailer, str|
|
93
|
+
str << scanner.scan(trailer).to_s # catches nil
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
data/lib/llt/sentence.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'llt/segmenter/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "llt-segmenter"
|
8
|
+
spec.version = LLT::Segmenter::VERSION
|
9
|
+
spec.authors = ["Gernot Höflechner, Robert Lichstensteiner, Christof Sirk"]
|
10
|
+
spec.email = ["latin.language.toolkit@gmail.com"]
|
11
|
+
spec.description = %q{Segments text into sentences}
|
12
|
+
spec.summary = %q{Segments text into sentences}
|
13
|
+
spec.homepage = "http://latin-language-toolkit.net"
|
14
|
+
spec.license = "MIT"
|
15
|
+
|
16
|
+
spec.files = `git ls-files`.split($/)
|
17
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
+
spec.require_paths = ["lib"]
|
20
|
+
|
21
|
+
spec.add_development_dependency "bundler", "~> 1.3"
|
22
|
+
spec.add_development_dependency "rake"
|
23
|
+
spec.add_development_dependency "rspec"
|
24
|
+
spec.add_development_dependency "simplecov", "~> 0.7"
|
25
|
+
spec.add_dependency "warbler"
|
26
|
+
#spec.add_dependency "llt-core"
|
27
|
+
#spec.add_dependency "llt-constants"
|
28
|
+
#spec.add_dependency "llt-logger"
|
29
|
+
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
ENV['RACK_ENV'] = 'test'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
require 'llt/segmenter/api'
|
5
|
+
require 'rack/test'
|
6
|
+
|
7
|
+
def app
|
8
|
+
Api
|
9
|
+
end
|
10
|
+
|
11
|
+
describe "segmenter api" do
|
12
|
+
include Rack::Test::Methods
|
13
|
+
|
14
|
+
describe '/segment' do
|
15
|
+
context "with URI as input" do
|
16
|
+
end
|
17
|
+
|
18
|
+
let(:text) {{text: "homo mittit. Marcus est."}}
|
19
|
+
|
20
|
+
context "with text as input" do
|
21
|
+
context "with accept header json" do
|
22
|
+
it "segments the given sentences" do
|
23
|
+
pending
|
24
|
+
get '/segment', text,
|
25
|
+
{"HTTP_ACCEPT" => "application/json"}
|
26
|
+
last_response.should be_ok
|
27
|
+
response = last_response.body
|
28
|
+
parsed_response = JSON.parse(response)
|
29
|
+
parsed_response.should have(2).items
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
context "with accept header xml" do
|
34
|
+
it "segments the given sentences" do
|
35
|
+
get '/segment', text,
|
36
|
+
{"HTTP_ACCEPT" => "application/xml"}
|
37
|
+
last_response.should be_ok
|
38
|
+
body = last_response.body
|
39
|
+
body.should =~ /<s n="1">homo mittit\.<\/s>/
|
40
|
+
body.should =~ /<s n="2">Marcus est\.<\/s>/
|
41
|
+
end
|
42
|
+
|
43
|
+
it "receives params for segmentation and markup" do
|
44
|
+
params = { indexing: false }.merge(text)
|
45
|
+
|
46
|
+
get '/segment', params,
|
47
|
+
{"HTTP_ACCEPT" => "application/xml"}
|
48
|
+
last_response.should be_ok
|
49
|
+
body = last_response.body
|
50
|
+
body.should =~ /<s>homo mittit\.<\/s>/
|
51
|
+
body.should =~ /<s>Marcus est\.<\/s>/
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
@@ -0,0 +1,259 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe LLT::Segmenter do
|
4
|
+
let(:segmenter) { LLT::Segmenter.new }
|
5
|
+
describe "#segment" do
|
6
|
+
it "returns an array of LLT::Sentence elements" do
|
7
|
+
sentences = segmenter.segment("est.")
|
8
|
+
sentences.should have(1).item
|
9
|
+
sentences.first.should be_a LLT::Sentence
|
10
|
+
end
|
11
|
+
|
12
|
+
it "segments a paragraph of into sentences - easy" do
|
13
|
+
txt = "Cicero est. Caesar est."
|
14
|
+
sentences = segmenter.segment(txt)
|
15
|
+
sentences.should have(2).items
|
16
|
+
sentences[0].to_s.should == "Cicero est."
|
17
|
+
sentences[1].to_s.should == "Caesar est."
|
18
|
+
end
|
19
|
+
|
20
|
+
it "segments a paragraph of into sentences - complex" do
|
21
|
+
txt = "Cicero est; quis Caesar est? Marcus Antonius!"
|
22
|
+
sentences = segmenter.segment(txt)
|
23
|
+
sentences.should have(3).items
|
24
|
+
sentences[0].to_s.should == "Cicero est;"
|
25
|
+
sentences[1].to_s.should == "quis Caesar est?"
|
26
|
+
sentences[2].to_s.should == "Marcus Antonius!"
|
27
|
+
end
|
28
|
+
|
29
|
+
it "creates indices by default" do
|
30
|
+
txt = "Cicero est; quis Caesar est? Marcus Antonius!"
|
31
|
+
sentences = segmenter.segment(txt)
|
32
|
+
sentences.map(&:id).should == [1, 2, 3]
|
33
|
+
end
|
34
|
+
|
35
|
+
it "indices can be turned off" do
|
36
|
+
txt = "Cicero est; quis Caesar est? Marcus Antonius!"
|
37
|
+
sentences = segmenter.segment(txt, indexing: false)
|
38
|
+
sentences.map(&:id).should == [nil, nil, nil]
|
39
|
+
end
|
40
|
+
|
41
|
+
it "handles abbreviated names" do
|
42
|
+
txt = "C. Caesar est. M. Tullius Cicero est."
|
43
|
+
sentences = segmenter.segment(txt)
|
44
|
+
sentences.should have(2).items
|
45
|
+
sentences[0].to_s.should == "C. Caesar est."
|
46
|
+
sentences[1].to_s.should == "M. Tullius Cicero est."
|
47
|
+
end
|
48
|
+
|
49
|
+
it "handles abbreviated dates" do
|
50
|
+
txt = "Is dies erat a. d. V Kal. Apr. L. Pisone, A. Gabinio consulibus."
|
51
|
+
sentences = segmenter.segment(txt)
|
52
|
+
sentences.should have(1).item
|
53
|
+
end
|
54
|
+
|
55
|
+
it "handles more dates" do
|
56
|
+
txt = "Is dies erat a. d. V Ian. Non. Feb. L. App. Pisone ."
|
57
|
+
sentences = segmenter.segment(txt)
|
58
|
+
puts sentences
|
59
|
+
sentences.should have(1).item
|
60
|
+
end
|
61
|
+
|
62
|
+
it "are only triggered when they have a leading word boundary" do
|
63
|
+
# spec might seem strange, but this didn't work from the start
|
64
|
+
txt = "erat nauta. est."
|
65
|
+
sentences = segmenter.segment(txt)
|
66
|
+
sentences.should have(2).items
|
67
|
+
end
|
68
|
+
|
69
|
+
it "handles dates even with numbers that have an abbr dot" do
|
70
|
+
pending('Not solved yet. Think of M.') do
|
71
|
+
txt = "Is dies erat a. d. V. Kal. Apr. L. Pisone, A. Gabinio consulibus."
|
72
|
+
sentences = segmenter.segment(txt)
|
73
|
+
sentences.should have(1).item
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
it "splits at :" do
|
78
|
+
txt = 'iubent: fugere manus.'
|
79
|
+
sentences = segmenter.segment(txt)
|
80
|
+
sentences.should have(2).items
|
81
|
+
end
|
82
|
+
|
83
|
+
it "doesn't create empty sentences" do
|
84
|
+
txt = "text.\n\n\ntext."
|
85
|
+
sentences = segmenter.segment(txt)
|
86
|
+
sentences.should have(2).items
|
87
|
+
end
|
88
|
+
|
89
|
+
context "with embedded xml" do
|
90
|
+
it "doesn't break up before xml closing tags" do
|
91
|
+
txt = '<grc> text.</grc>'
|
92
|
+
sentences = segmenter.segment(txt)
|
93
|
+
sentences.should have(1).item
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
context "newline (\\n) handling" do
|
98
|
+
it "works when in between" do
|
99
|
+
txt = "Filia est.\nFilius est."
|
100
|
+
sentences = segmenter.segment(txt)
|
101
|
+
sentences.should have(2).items
|
102
|
+
sentences[0].to_s.should == "Filia est."
|
103
|
+
sentences[1].to_s.should == "Filius est."
|
104
|
+
end
|
105
|
+
|
106
|
+
it "works when at the end of a text" do
|
107
|
+
sentences = segmenter.segment("Marcus est.\n")
|
108
|
+
sentences.should have(1).item
|
109
|
+
sentences.first.to_s.should == 'Marcus est.'
|
110
|
+
end
|
111
|
+
|
112
|
+
it "works with newline and space in between and no new line at the end" do
|
113
|
+
txt = "Fīlius rēgīnae erat.\n Rēgīnam aurō dōnābunt."
|
114
|
+
sentences = segmenter.segment(txt)
|
115
|
+
sentences.should have(2).items
|
116
|
+
sentences[0].to_s.should == "Fīlius rēgīnae erat."
|
117
|
+
sentences[1].to_s.should == "Rēgīnam aurō dōnābunt."
|
118
|
+
end
|
119
|
+
|
120
|
+
it "works with newline and space in between and new line at the end" do
|
121
|
+
txt = "Fīlius rēgīnae erat nauta.\n Rēgīnam aurō dōnābunt.\n"
|
122
|
+
sentences = segmenter.segment(txt)
|
123
|
+
sentences.should have(2).items
|
124
|
+
sentences[0].to_s.should == "Fīlius rēgīnae erat nauta."
|
125
|
+
sentences[1].to_s.should == "Rēgīnam aurō dōnābunt."
|
126
|
+
end
|
127
|
+
|
128
|
+
it "treats an empty line as delimiter - might e.g. appear in book titles" do
|
129
|
+
txt = "Marcus est\n\nMarcus est."
|
130
|
+
sentences = segmenter.segment(txt)
|
131
|
+
sentences.should have(2).item
|
132
|
+
end
|
133
|
+
|
134
|
+
it "number of newlines that count as sentence boundary can be given as option" do
|
135
|
+
txt1 = "Marcus est\n\nMarcus est."
|
136
|
+
txt2 = "Marcus est\n\n\nMarcus est."
|
137
|
+
sentences1 = segmenter.segment(txt1, newline_boundary: 3)
|
138
|
+
sentences2 = segmenter.segment(txt2, newline_boundary: 3)
|
139
|
+
sentences1.should have(1).item
|
140
|
+
sentences2.should have(2).item
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
it "handles quantified texts" do
|
145
|
+
txt = "Fēmina puellae pecūniam dabat.\n Fīlia poētae in viīs errābat.\n"
|
146
|
+
sentences = segmenter.segment(txt)
|
147
|
+
sentences.should have(2).item
|
148
|
+
end
|
149
|
+
|
150
|
+
it "is not disturbed by leading or trailing whitespace" do
|
151
|
+
txt = ' Marcus est. Marcus est. '
|
152
|
+
sentences = segmenter.segment(txt)
|
153
|
+
sentences.should have(2).item
|
154
|
+
end
|
155
|
+
|
156
|
+
context "with ellipsis punctuation" do
|
157
|
+
it "handles them at the end of a sentence" do
|
158
|
+
txt = 'Marcus ...'
|
159
|
+
sentences = segmenter.segment(txt)
|
160
|
+
sentences.should have(1).item
|
161
|
+
end
|
162
|
+
|
163
|
+
it "handles them in the midst of a sentence" do
|
164
|
+
pending 'Tough to do'
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
context "direct speech delimiter" do
|
169
|
+
context "with '" do
|
170
|
+
it "handles basic cases when on the outside of the punctuation" do
|
171
|
+
txt = "'Marcus est.'"
|
172
|
+
sentences = segmenter.segment(txt)
|
173
|
+
sentences.should have(1).item
|
174
|
+
end
|
175
|
+
|
176
|
+
it "handles basic cases when on the inside of the punctuation" do
|
177
|
+
txt = "'Marcus est'?"
|
178
|
+
sentences = segmenter.segment(txt)
|
179
|
+
sentences.should have(1).item
|
180
|
+
end
|
181
|
+
end
|
182
|
+
|
183
|
+
context 'with "' do
|
184
|
+
it "handles basic cases when on the outside of the punctuation" do
|
185
|
+
txt = '"Marcus est."'
|
186
|
+
sentences = segmenter.segment(txt)
|
187
|
+
sentences.should have(1).item
|
188
|
+
end
|
189
|
+
|
190
|
+
it "handles basic cases when on the inside of the punctuation" do
|
191
|
+
txt = '"Marcus est"?'
|
192
|
+
sentences = segmenter.segment(txt)
|
193
|
+
sentences.should have(1).item
|
194
|
+
end
|
195
|
+
end
|
196
|
+
|
197
|
+
context 'with ” (attention: this is NOT the same as "' do
|
198
|
+
it "handles basic cases when on the outside of the punctuation" do
|
199
|
+
txt = '”Marcus est.”'
|
200
|
+
sentences = segmenter.segment(txt)
|
201
|
+
sentences.should have(1).item
|
202
|
+
end
|
203
|
+
|
204
|
+
it "handles basic cases when on the inside of the punctuation" do
|
205
|
+
txt = '”Marcus est”?'
|
206
|
+
sentences = segmenter.segment(txt)
|
207
|
+
sentences.should have(1).item
|
208
|
+
end
|
209
|
+
end
|
210
|
+
end
|
211
|
+
|
212
|
+
it "catches trailing parenthesis" do
|
213
|
+
txt = "Marcus est. (Marcus est.) Marcus est."
|
214
|
+
sentences = segmenter.segment(txt)
|
215
|
+
sentences.should have(3).items
|
216
|
+
sentences[0].to_s.should == 'Marcus est.'
|
217
|
+
sentences[1].to_s.should == '(Marcus est.)'
|
218
|
+
sentences[2].to_s.should == 'Marcus est.'
|
219
|
+
end
|
220
|
+
|
221
|
+
it "handles broken off texts - the rest is an own sentence" do
|
222
|
+
txt = "Marcus est. Marcus est"
|
223
|
+
sentences = segmenter.segment(txt)
|
224
|
+
sentences.should have(2).item
|
225
|
+
end
|
226
|
+
|
227
|
+
context "with no delimiters present" do
|
228
|
+
it "tries to fallback to single newline boundary" do
|
229
|
+
txt = "Marcus est\nMarcus est"
|
230
|
+
segmenter.segment(txt).should have(2).items
|
231
|
+
end
|
232
|
+
|
233
|
+
it "returns the whole input as segment when there are no newlines" do
|
234
|
+
txt = "Marcus est"
|
235
|
+
segmenter.segment(txt).should have(1).item
|
236
|
+
end
|
237
|
+
end
|
238
|
+
|
239
|
+
describe "takes an optional keyword argument add_to" do
|
240
|
+
class ParagraphDummy
|
241
|
+
attr_reader :sentences
|
242
|
+
def initialize; @sentences = []; end
|
243
|
+
def <<(sentences); @sentences += sentences; end
|
244
|
+
end
|
245
|
+
|
246
|
+
it "adds the result to the given object if #<< is implemented" do
|
247
|
+
paragraph = ParagraphDummy.new
|
248
|
+
s = segmenter.segment("", add_to: paragraph)
|
249
|
+
paragraph.sentences.should == s
|
250
|
+
end
|
251
|
+
|
252
|
+
it "does nothing to the given object when #<< it does not respond to" do
|
253
|
+
object = double(respond_to?: false)
|
254
|
+
object.should_not receive(:<<)
|
255
|
+
segmenter.segment("", add_to: object)
|
256
|
+
end
|
257
|
+
end
|
258
|
+
end
|
259
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'simplecov'
|
2
|
+
require 'coveralls'
|
3
|
+
|
4
|
+
Coveralls.wear!
|
5
|
+
|
6
|
+
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
|
7
|
+
SimpleCov::Formatter::HTMLFormatter,
|
8
|
+
Coveralls::SimpleCov::Formatter
|
9
|
+
]
|
10
|
+
|
11
|
+
SimpleCov.start do
|
12
|
+
add_filter '/spec/'
|
13
|
+
end
|
14
|
+
|
15
|
+
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
16
|
+
require 'llt/segmenter'
|
17
|
+
|
18
|
+
if defined?(LLT::Logger)
|
19
|
+
LLT::Logger.level = nil
|
20
|
+
end
|
21
|
+
|
22
|
+
RSpec.configure do |config|
|
23
|
+
config.treat_symbols_as_metadata_keys_with_true_values = true
|
24
|
+
config.run_all_when_everything_filtered = true
|
25
|
+
config.filter_run :focus
|
26
|
+
end
|
metadata
ADDED
@@ -0,0 +1,134 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: llt-segmenter
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-12-08 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.3'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.3'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: simplecov
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0.7'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0.7'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: warbler
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
description: Segments text into sentences
|
84
|
+
email:
|
85
|
+
- latin.language.toolkit@gmail.com
|
86
|
+
executables: []
|
87
|
+
extensions: []
|
88
|
+
extra_rdoc_files: []
|
89
|
+
files:
|
90
|
+
- ".gitignore"
|
91
|
+
- ".rspec"
|
92
|
+
- ".travis.yml"
|
93
|
+
- Gemfile
|
94
|
+
- LICENSE
|
95
|
+
- README.md
|
96
|
+
- Rakefile
|
97
|
+
- config.ru
|
98
|
+
- config/warble.rb
|
99
|
+
- lib/llt/segmenter.rb
|
100
|
+
- lib/llt/segmenter/api.rb
|
101
|
+
- lib/llt/segmenter/version.rb
|
102
|
+
- lib/llt/sentence.rb
|
103
|
+
- llt-segmenter.gemspec
|
104
|
+
- spec/lib/llt/segmenter/api_spec.rb
|
105
|
+
- spec/lib/llt/segmenter_spec.rb
|
106
|
+
- spec/spec_helper.rb
|
107
|
+
homepage: http://latin-language-toolkit.net
|
108
|
+
licenses:
|
109
|
+
- MIT
|
110
|
+
metadata: {}
|
111
|
+
post_install_message:
|
112
|
+
rdoc_options: []
|
113
|
+
require_paths:
|
114
|
+
- lib
|
115
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
116
|
+
requirements:
|
117
|
+
- - ">="
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: '0'
|
120
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
requirements: []
|
126
|
+
rubyforge_project:
|
127
|
+
rubygems_version: 2.1.5
|
128
|
+
signing_key:
|
129
|
+
specification_version: 4
|
130
|
+
summary: Segments text into sentences
|
131
|
+
test_files:
|
132
|
+
- spec/lib/llt/segmenter/api_spec.rb
|
133
|
+
- spec/lib/llt/segmenter_spec.rb
|
134
|
+
- spec/spec_helper.rb
|