srx-english 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +3 -0
- data/README.rdoc +75 -0
- data/changelog.txt +2 -0
- data/features/sentence_splitter.feature +34 -0
- data/features/steps/sentence_splitter.rb +17 -0
- data/features/steps/word_splitter.rb +26 -0
- data/features/word_splitter.feature +17 -0
- data/lib/srx/english/sentence_splitter.rb +96 -0
- data/lib/srx/english/word_splitter.rb +57 -0
- data/srx-english.gemspec +23 -0
- metadata +77 -0
data/.gitignore
ADDED
data/README.rdoc
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
== srx-english
|
2
|
+
|
3
|
+
* https://github.com/apohllo/srx-english
|
4
|
+
|
5
|
+
= DESCRIPTION
|
6
|
+
|
7
|
+
'srx-english' is a Ruby library containing English sentence and word segmentation rules.
|
8
|
+
The sentence segmentation rules are based on rules defined by Marcin Miłkowski:
|
9
|
+
http://morfologik.blogspot.com/2009/11/talking-about-srx-in-lt-during-ltc.html
|
10
|
+
|
11
|
+
= FEATURES/PROBLEMS
|
12
|
+
|
13
|
+
* this library is generated by 'srx2ruby' which has some limitations and might
|
14
|
+
not be 100% SRX standard compliant.
|
15
|
+
|
16
|
+
= INSTALL
|
17
|
+
|
18
|
+
Standard rubygems installation:
|
19
|
+
|
20
|
+
$ gem install srx-english
|
21
|
+
|
22
|
+
= BASIC USAGE
|
23
|
+
|
24
|
+
The library defines the SRX::English::SentenceSplitter class, which lets you iterate
|
25
|
+
over the matched sentences:
|
26
|
+
|
27
|
+
require 'srx/english/sentence_splitter'
|
28
|
+
|
29
|
+
text =<<-END
|
30
|
+
This is e.g. Mr. Smith, who talks slowly... And this is another sentence.
|
31
|
+
END
|
32
|
+
|
33
|
+
splitter = SRX::English::SentenceSplitter.new(text)
|
34
|
+
splitter.each do |sentence|
|
35
|
+
puts sentence.gsub(/\n|\r/,"")
|
36
|
+
end
|
37
|
+
# This is e.g. Mr. Smith, who talks slowly...
|
38
|
+
# And this is another sentence.
|
39
|
+
|
40
|
+
require 'srx/english/word_splitter'
|
41
|
+
|
42
|
+
sentence = 'My home is my castle.'
|
43
|
+
splitter = SRX::English::WordSplitter.new(sentence)
|
44
|
+
splitter.each do |word,type|
|
45
|
+
puts "'#{word}' #{type}"
|
46
|
+
end
|
47
|
+
# 'My' word
|
48
|
+
# ' ' other
|
49
|
+
# 'home' word
|
50
|
+
# ' ' other
|
51
|
+
# 'is' word
|
52
|
+
# ' ' other
|
53
|
+
# 'my' word
|
54
|
+
# ' ' other
|
55
|
+
# 'castle' word
|
56
|
+
# '.' punct
|
57
|
+
|
58
|
+
|
59
|
+
== LICENSE
|
60
|
+
|
61
|
+
Copyright (C) 2011 Aleksander Pohl, Marcin Miłkowski, Jarosław Lipski
|
62
|
+
|
63
|
+
This program is free software: you can redistribute it and/or modify
|
64
|
+
it under the terms of the GNU General Public License as published by
|
65
|
+
the Free Software Foundation, either version 3 of the License, or
|
66
|
+
(at your option) any later version.
|
67
|
+
|
68
|
+
This program is distributed in the hope that it will be useful,
|
69
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
70
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
71
|
+
GNU General Public License for more details.
|
72
|
+
|
73
|
+
== FEEDBACK
|
74
|
+
|
75
|
+
* mailto:apohllo@o2.pl
|
data/changelog.txt
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
Feature: sentence splitter
|
2
|
+
Scenario: splitting text
|
3
|
+
Given a text
|
4
|
+
"""
|
5
|
+
It [really!] works.
|
6
|
+
"""
|
7
|
+
When the text is split
|
8
|
+
Then the following sentences should be detected
|
9
|
+
| sentence |
|
10
|
+
#-------------------- #
|
11
|
+
| It [really!] works. |
|
12
|
+
|
13
|
+
Given a text
|
14
|
+
"""
|
15
|
+
This is e.g. Mr. Smith, who talks slowly... And this is another sentence.
|
16
|
+
"""
|
17
|
+
When the text is split
|
18
|
+
Then the following sentences should be detected
|
19
|
+
| sentence |
|
20
|
+
#---------------------------------------------#
|
21
|
+
| This is e.g. Mr. Smith, who talks slowly... |
|
22
|
+
| And this is another sentence. |
|
23
|
+
|
24
|
+
Given a text
|
25
|
+
"""
|
26
|
+
Leave me alone!, he yelled. I am in the U.S. Army. Charles (Ind.) said he.
|
27
|
+
"""
|
28
|
+
When the text is split
|
29
|
+
Then the following sentences should be detected
|
30
|
+
| sentence |
|
31
|
+
#-----------------------------#
|
32
|
+
| Leave me alone!, he yelled. |
|
33
|
+
| I am in the U.S. Army. |
|
34
|
+
| Charles (Ind.) said he. |
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# encoding: utf-8
$:.unshift "lib"
require 'srx/english/sentence_splitter'

# Cucumber step definitions for the sentence splitter feature.

# Remembers the doc string attached to the step as the text to segment.
Given /^a text$/ do |doc_string|
  @text = doc_string
end

# Builds the splitter for the remembered text; splitting itself is lazy.
When /^the text is split$/ do
  @splitter = SRX::English::SentenceSplitter.new(@text)
end

# Compares the detected sentences with the table rows, position by position.
# Line breaks (with the whitespace preceding them) are collapsed to a single
# space and the result is stripped before the comparison.
Then /^the following sentences should be detected$/ do |table|
  detected = @splitter.to_a
  table.hashes.each_with_index do |row, position|
    normalized = detected[position].gsub(/\s*\n/," ").strip
    normalized.should == row[:sentence]
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# encoding: utf-8
$:.unshift "lib"
# The library must be loaded here: these steps reference
# SRX::English::WordSplitter directly.
require 'srx/english/word_splitter'

# Cucumber step definitions for the word splitter feature.

# Remembers the quoted sentence from the step text, tagged as UTF-8.
Given /^a sentence '([^']+)'$/ do |sentence|
  @sentence = sentence.force_encoding('utf-8')
end

# Builds the splitter for the remembered sentence.
When /^the sentence is split$/ do
  @splitter = SRX::English::WordSplitter.new(@sentence)
end

# Compares every detected [segment, type] pair with the table rows,
# position by position. Apostrophes are removed from the expected segment
# before the comparison (the feature table uses them to quote blanks).
Then /^the following segments should be detected$/ do |table|
  table.hashes.zip(@splitter.to_a).each do |expected,returned|
    returned[0].should == expected[:segment].gsub(/'/,"")
    returned[1].should == expected[:type].to_sym
  end
end

# Same comparison as above, but segments of type :other (blanks and other
# non-visible characters) are skipped before matching against the table.
Then /^the following non-blank segments should be detected$/ do |table|
  segments = @splitter.select{|s| s[1] != :other}
  table.hashes.zip(segments).each do |expected,returned|
    returned[0].should == expected[:segment].gsub(/'/,"")
    returned[1].should == expected[:type].to_sym
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
Feature: word splitter
|
2
|
+
Scenario: splitting a sentence
|
3
|
+
Given a sentence 'My home is my castle.'
|
4
|
+
When the sentence is split
|
5
|
+
Then the following segments should be detected
|
6
|
+
| segment | type |
|
7
|
+
#-----------------#
|
8
|
+
| My | word |
|
9
|
+
| ' ' | other |
|
10
|
+
| home | word |
|
11
|
+
| ' ' | other |
|
12
|
+
| is | word |
|
13
|
+
| ' ' | other |
|
14
|
+
| my | word |
|
15
|
+
| ' ' | other |
|
16
|
+
| castle | word |
|
17
|
+
| . | punct |
|
@@ -0,0 +1,96 @@
|
|
1
|
+
#encoding: utf-8
|
2
|
+
require 'stringio'
|
3
|
+
require 'term/ansicolor'
|
4
|
+
module SRX
  module English
    # Sentence segmentation rules. Each entry is a triple
    #   [before_pattern, after_pattern, is_break]
    # where +before_pattern+ has to match the end of the text consumed so far,
    # +after_pattern+ (nil means "anything") has to match the beginning of the
    # upcoming text, and +is_break+ tells whether the position between them
    # is a sentence boundary (true) or an exception to the break rules (false).
    # The rules are tried in order and the first matching one wins.
    #
    # NOTE(review): `[Vvol]` in the abbreviation rule is a character class
    # (one of V, v, o, l); it possibly was meant to be `[Vv]ol` - confirm
    # against the SRX source the rules were generated from.
    RULES =
      [["(?:['\"„][\\.!?…]['\"”]\\s)|(?:[^\\.]\\s[A-Z]\\.\\s)|(?:\\b(?:St|Gen|Hon|Prof|Dr|Mr|Ms|Mrs|[JS]r|Col|Maj|Brig|Sgt|Capt|Cmnd|Sen|Rev|Rep|Revd)\\.\\s)|(?:\\b(?:St|Gen|Hon|Prof|Dr|Mr|Ms|Mrs|[JS]r|Col|Maj|Brig|Sgt|Capt|Cmnd|Sen|Rev|Rep|Revd)\\.\\s[A-Z]\\.\\s)|(?:\\bApr\\.\\s)|(?:\\bAug\\.\\s)|(?:\\bBros\\.\\s)|(?:\\bCo\\.\\s)|(?:\\bCorp\\.\\s)|(?:\\bDec\\.\\s)|(?:\\bDist\\.\\s)|(?:\\bFeb\\.\\s)|(?:\\bInc\\.\\s)|(?:\\bJan\\.\\s)|(?:\\bJul\\.\\s)|(?:\\bJun\\.\\s)|(?:\\bMar\\.\\s)|(?:\\bNov\\.\\s)|(?:\\bOct\\.\\s)|(?:\\bPh\\.?D\\.\\s)|(?:\\bSept?\\.\\s)|(?:\\b\\p{Lu}\\.\\p{Lu}\\.\\s)|(?:\\b\\p{Lu}\\.\\s\\p{Lu}\\.\\s)|(?:\\bcf\\.\\s)|(?:\\be\\.g\\.\\s)|(?:\\besp\\.\\s)|(?:\\bet\\b\\s\\bal\\.\\s)|(?:\\bvs\\.\\s)|(?:\\p{Ps}[!?]+\\p{Pe} )",
        nil,
        false],
       ["(?:[\\.\\s]\\p{L}{1,2}\\.\\s)", "[\\p{N}\\p{Ll}]", false],
       ["(?:[\\[\\(]*\\.\\.\\.[\\]\\)]* )", "[^\\p{Lu}]", false],
       ["(?:\\b(?:pp|[Vv]iz|i\\.?\\s*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl|Pres|[Dd]ept|min|max|[Gg]ovt|lb|ft|c\\.?\\s*f|vs)\\.\\s)",
        "[^\\p{Lu}]|I",
        false],
       # The after pattern used to read "[^p{Lu}]" (missing backslash), i.e.
       # "any character except p, {, L, u, }" instead of "not an upper-case
       # letter".
       ["(?:\\b[Ee]tc\\.\\s)", "[^\\p{Lu}]", false],
       ["(?:[\\.!?…]+\\p{Pe} )|(?:[\\[\\(]*…[\\]\\)]* )", "\\p{Ll}", false],
       ["(?:\\b\\p{L}\\.)", "\\p{L}\\.", false],
       ["(?:\\b\\p{L}\\.\\s)", "\\p{L}\\.\\s", false],
       ["(?:\\b[Ff]igs?\\.\\s)|(?:\\b[nN]o\\.\\s)", "\\p{N}", false],
       ["(?:[\"”']\\s*)", "\\s*\\p{Ll}", false],
       ["(?:[\\.!?…][\\u00BB\\u2019\\u201D\\u203A\"'\\p{Pe}\\u0002]*\\s)|(?:\\r?\\n)",
        nil,
        true],
       ["(?:[\\.!?…]['\"\\u00BB\\u2019\\u201D\\u203A\\p{Pe}\\u0002]*)",
        "\\p{Lu}[^\\p{Lu}]",
        true],
       ["(?:\\s\\p{L}[\\.!?…]\\s)", "\\p{Lu}\\p{Ll}", true]]
    # Union of all "before" patterns - a cheap pre-check that tells whether
    # any rule can apply at the current position at all.
    BEFORE_RE = /(?:#{RULES.map{|s,e,v| "(#{s})"}.join("|")})\Z/m
    # The rules compiled to [before_regexp, after_regexp, is_break] triples.
    REGEXPS = RULES.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }
    # Matches the first character of a string (newline included). No longer
    # used by the splitter itself; kept for backward compatibility.
    FIRST_CHAR = /\A./m


    # Splits a text into sentences according to RULES.
    #
    #   SRX::English::SentenceSplitter.new("One. Two.").to_a
    class SentenceSplitter
      include Enumerable

      # Number of characters of context kept on each side of the position
      # which is examined for a sentence boundary.
      BUFFER_LENGTH = 10

      attr_accessor :input
      attr_writer :debug

      # The sentence splitter is initialized with the +text+ to split.
      # This might be a String (it is wrapped in a StringIO) or an IO-like
      # object; +each+ calls +pos=+, +eof?+ and +readchar+ on it.
      def initialize(text=nil)
        if text.is_a?(String)
          @input = StringIO.new(text,"r:utf-8")
        else
          @input = text
        end
      end

      # Iterate over the sentences in the text, yielding each sentence
      # (a String, including its trailing whitespace) to the block.
      #
      # Returns an Enumerator when called without a block.
      # Raises RuntimeError if the input is nil.
      def each
        raise "Invalid argument - text is nil" if @input.nil?
        return enum_for(:each) unless block_given?

        sentence = ""
        before_buffer = ""
        after_buffer = ""
        @input.pos = 0
        # Prime the look-ahead; the text may be shorter than the buffer.
        BUFFER_LENGTH.times do
          break if @input.eof?
          after_buffer << @input.readchar
        end

        # Slide the window one character at a time. The loop runs until the
        # look-ahead is drained (not merely until the input hits EOF), so the
        # boundaries in the final BUFFER_LENGTH characters are examined too.
        until after_buffer.empty?
          if BEFORE_RE.match(before_buffer)
            if @debug
              puts "#{before_buffer}|#{after_buffer.gsub(/\n/,"\\n")}"
            end
            # The first rule matching on both sides decides.
            rule = REGEXPS.find do |before_re,after_re,_value|
              before_re.match(before_buffer) && after_re.match(after_buffer)
            end
            if rule
              before_re, after_re, is_break = rule
              if @debug
                color = is_break ? :red : :green
                sentence << Term::ANSIColor.send(color,"<#{before_re}:#{after_re}>")
              end
              if is_break
                yield sentence
                sentence = ""
              end
            end
          end

          # Move one character from the look-ahead to the consumed text.
          current = after_buffer.slice!(0,1)
          after_buffer << @input.readchar unless @input.eof?
          before_buffer.slice!(0,1) if before_buffer.size >= BUFFER_LENGTH
          before_buffer << current
          sentence << current
        end
        # Whatever is left after the last boundary forms the final sentence.
        yield sentence unless sentence.empty?
      end
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module SRX
  module English
    # Splits a sentence into typed segments (words, numbers, punctuation...).
    #
    #   SRX::English::WordSplitter.new('My home').to_a
    #   # => [["My", :word], [" ", :other], ["home", :word]]
    class WordSplitter
      include Enumerable

      attr_accessor :sentence

      # Segment type => pattern. The order matters: the patterns are joined
      # into one alternation, so earlier types take precedence.
      SPLIT_RULES = {
        :word => "\\p{Alpha}\\p{Word}*",
        :number => "\\p{Digit}+(?:[:., _/-]\\p{Digit}+)*",
        :punct => "\\p{Punct}",
        :graph => "\\p{Graph}",
        :other => "[^\\p{Word}\\p{Graph}]+"
      }

      # One capture group per rule, in the order of SPLIT_RULES.
      SPLIT_RE = /#{SPLIT_RULES.values.map{|v| "(#{v})"}.join("|")}/m

      # Segment types indexed like the capture groups of SPLIT_RE.
      SEGMENT_TYPES = SPLIT_RULES.keys

      # The initializer accepts a +sentence+; +each+ calls +scan+ on it,
      # so a String is expected.
      #
      # The splitter might be initialized without the sentence,
      # but it should be set using the accessor before the first call to
      # the +each+ method.
      def initialize(sentence=nil)
        @sentence = sentence
      end

      # This method iterates over the segments of the sentence.
      # It yields the string representation of the segment and
      # its type, which is one of:
      # * +:word+ - a regular word (including words containing numbers, like A4)
      # * +:number+ - a number (including number with spaces, dashes, slashes, etc.)
      # * +:punct+ - single punctuation character (comma, semicolon, full stop, etc.)
      # * +:graph+ - any single graphical (visible) character
      # * +:other+ - anything which is not covered by the above types (non-visible
      #   characters in particular)
      #
      # Returns an Enumerator when called without a block.
      # Raises RuntimeError if the sentence is nil.
      def each
        raise "Invalid argument - sentence is nil" if @sentence.nil?
        return enum_for(:each) unless block_given?

        @sentence.scan(SPLIT_RE) do |groups|
          # Exactly one capture group is set for every match; its position
          # identifies the segment type.
          position = groups.index { |group| !group.nil? }
          yield groups[position], SEGMENT_TYPES[position]
        end
      end
    end
  end
end
|
57
|
+
|
data/srx-english.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
|
4
|
+
# Gem specification of srx-english.
Gem::Specification.new do |s|
  s.name        = "srx-english"
  s.version     = "0.1.0"
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Aleksander Pohl"]
  s.email       = ["apohllo@o2.pl"]
  # Project page as given in the README (previously pointed to the
  # 'srx2ruby' generator project).
  s.homepage    = "http://github.com/apohllo/srx-english"
  s.summary     = %q{English sentence and word segmentation rules.}
  s.description = %q{English sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Miłkowski's SRX rules.}
  # The README licenses the code under the GNU GPL, version 3 or later.
  s.licenses    = ["GPL-3.0-or-later"]

  s.rubyforge_project = "srx-english"
  s.has_rdoc = false

  # The file lists are taken from the git index, so the gem has to be built
  # from within a git checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # Used by the sentence splitter to colorize its debug output.
  s.add_dependency("term-ansicolor", ["~> 1.0.5"])
end
|
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: srx-english
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.1.0
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Aleksander Pohl
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2012-04-19 00:00:00 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: term-ansicolor
|
17
|
+
prerelease: false
|
18
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
19
|
+
none: false
|
20
|
+
requirements:
|
21
|
+
- - ~>
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.0.5
|
24
|
+
type: :runtime
|
25
|
+
version_requirements: *id001
|
26
|
+
description: "English sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Mi\xC5\x82kowski's SRX rules."
|
27
|
+
email:
|
28
|
+
- apohllo@o2.pl
|
29
|
+
executables: []
|
30
|
+
|
31
|
+
extensions: []
|
32
|
+
|
33
|
+
extra_rdoc_files: []
|
34
|
+
|
35
|
+
files:
|
36
|
+
- .gitignore
|
37
|
+
- README.rdoc
|
38
|
+
- changelog.txt
|
39
|
+
- features/sentence_splitter.feature
|
40
|
+
- features/steps/sentence_splitter.rb
|
41
|
+
- features/steps/word_splitter.rb
|
42
|
+
- features/word_splitter.feature
|
43
|
+
- lib/srx/english/sentence_splitter.rb
|
44
|
+
- lib/srx/english/word_splitter.rb
|
45
|
+
- srx-english.gemspec
|
46
|
+
homepage: http://github.com/apohllo/srx2ruby
|
47
|
+
licenses: []
|
48
|
+
|
49
|
+
post_install_message:
|
50
|
+
rdoc_options: []
|
51
|
+
|
52
|
+
require_paths:
|
53
|
+
- lib
|
54
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: "0"
|
60
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: "0"
|
66
|
+
requirements: []
|
67
|
+
|
68
|
+
rubyforge_project: srx-english
|
69
|
+
rubygems_version: 1.8.21
|
70
|
+
signing_key:
|
71
|
+
specification_version: 3
|
72
|
+
summary: English sentence and word segmentation rules.
|
73
|
+
test_files:
|
74
|
+
- features/sentence_splitter.feature
|
75
|
+
- features/steps/sentence_splitter.rb
|
76
|
+
- features/steps/word_splitter.rb
|
77
|
+
- features/word_splitter.feature
|