srx-english 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +3 -0
- data/README.rdoc +75 -0
- data/changelog.txt +2 -0
- data/features/sentence_splitter.feature +34 -0
- data/features/steps/sentence_splitter.rb +17 -0
- data/features/steps/word_splitter.rb +26 -0
- data/features/word_splitter.feature +17 -0
- data/lib/srx/english/sentence_splitter.rb +96 -0
- data/lib/srx/english/word_splitter.rb +57 -0
- data/srx-english.gemspec +23 -0
- metadata +77 -0
data/.gitignore
ADDED
data/README.rdoc
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
== srx-english
|
2
|
+
|
3
|
+
* https://github.com/apohllo/srx-english
|
4
|
+
|
5
|
+
= DESCRIPTION
|
6
|
+
|
7
|
+
'srx-english' is a Ruby library containing English sentence and word segmentation rules.
|
8
|
+
The sentence segmentation rules are based on rules defined by Marcin Miłkowski:
|
9
|
+
http://morfologik.blogspot.com/2009/11/talking-about-srx-in-lt-during-ltc.html
|
10
|
+
|
11
|
+
= FEATURES/PROBLEMS
|
12
|
+
|
13
|
+
* this library is generated by 'srx2ruby' which has some limitations and might
|
14
|
+
not be 100% SRX standard compliant.
|
15
|
+
|
16
|
+
= INSTALL
|
17
|
+
|
18
|
+
Standard rubygems installation:
|
19
|
+
|
20
|
+
$ gem install srx-english
|
21
|
+
|
22
|
+
= BASIC USAGE
|
23
|
+
|
24
|
+
The library defines the SRX::English::SentenceSplitter class, which allows you to iterate
|
25
|
+
over the matched sentences:
|
26
|
+
|
27
|
+
require 'srx/english/sentence_splitter'
|
28
|
+
|
29
|
+
text =<<-END
|
30
|
+
This is e.g. Mr. Smith, who talks slowly... And this is another sentence.
|
31
|
+
END
|
32
|
+
|
33
|
+
splitter = SRX::English::SentenceSplitter.new(text)
|
34
|
+
splitter.each do |sentence|
|
35
|
+
puts sentence.gsub(/\n|\r/,"")
|
36
|
+
end
|
37
|
+
# This is e.g. Mr. Smith, who talks slowly...
|
38
|
+
# And this is another sentence.
|
39
|
+
|
40
|
+
require 'srx/english/word_splitter'
|
41
|
+
|
42
|
+
sentence = 'My home is my castle.'
|
43
|
+
splitter = SRX::English::WordSplitter.new(sentence)
|
44
|
+
splitter.each do |word,type|
|
45
|
+
puts "'#{word}' #{type}"
|
46
|
+
end
|
47
|
+
# 'My' word
|
48
|
+
# ' ' other
|
49
|
+
# 'home' word
|
50
|
+
# ' ' other
|
51
|
+
# 'is' word
|
52
|
+
# ' ' other
|
53
|
+
# 'my' word
|
54
|
+
# ' ' other
|
55
|
+
# 'castle' word
|
56
|
+
# '.' punct
|
57
|
+
|
58
|
+
|
59
|
+
== LICENSE
|
60
|
+
|
61
|
+
Copyright (C) 2011 Aleksander Pohl, Marcin Miłkowski, Jarosław Lipski
|
62
|
+
|
63
|
+
This program is free software: you can redistribute it and/or modify
|
64
|
+
it under the terms of the GNU General Public License as published by
|
65
|
+
the Free Software Foundation, either version 3 of the License, or
|
66
|
+
(at your option) any later version.
|
67
|
+
|
68
|
+
This program is distributed in the hope that it will be useful,
|
69
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
70
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
71
|
+
GNU General Public License for more details.
|
72
|
+
|
73
|
+
== FEEDBACK
|
74
|
+
|
75
|
+
* mailto:apohllo@o2.pl
|
data/changelog.txt
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
Feature: sentence splitter
|
2
|
+
Scenario: splitting text
|
3
|
+
Given a text
|
4
|
+
"""
|
5
|
+
It [really!] works.
|
6
|
+
"""
|
7
|
+
When the text is split
|
8
|
+
Then the following sentences should be detected
|
9
|
+
| sentence |
|
10
|
+
#-------------------- #
|
11
|
+
| It [really!] works. |
|
12
|
+
|
13
|
+
Given a text
|
14
|
+
"""
|
15
|
+
This is e.g. Mr. Smith, who talks slowly... And this is another sentence.
|
16
|
+
"""
|
17
|
+
When the text is split
|
18
|
+
Then the following sentences should be detected
|
19
|
+
| sentence |
|
20
|
+
#---------------------------------------------#
|
21
|
+
| This is e.g. Mr. Smith, who talks slowly... |
|
22
|
+
| And this is another sentence. |
|
23
|
+
|
24
|
+
Given a text
|
25
|
+
"""
|
26
|
+
Leave me alone!, he yelled. I am in the U.S. Army. Charles (Ind.) said he.
|
27
|
+
"""
|
28
|
+
When the text is split
|
29
|
+
Then the following sentences should be detected
|
30
|
+
| sentence |
|
31
|
+
#-----------------------------#
|
32
|
+
| Leave me alone!, he yelled. |
|
33
|
+
| I am in the U.S. Army. |
|
34
|
+
| Charles (Ind.) said he. |
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
$:.unshift "lib"
|
3
|
+
require 'srx/english/sentence_splitter'
|
4
|
+
|
5
|
+
# Remember the doc-string attached to the step as the text under test.
Given(/^a text$/) do |text|
  @text = text
end

# Build the splitter for the remembered text.
When(/^the text is split$/) do
  @splitter = SRX::English::SentenceSplitter.new(@text)
end

# Compare the detected sentences, position by position, with the
# +sentence+ column of the expectation table.
Then(/^the following sentences should be detected$/) do |table|
  detected = @splitter.to_a
  table.hashes.each_with_index do |expected, index|
    # Whitespace runs ending in a newline collapse to one space and the
    # result is trimmed before it is compared with the table cell.
    normalized = detected[index].gsub(/\s*\n/," ").strip
    normalized.should == expected[:sentence]
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# encoding: utf-8
$:.unshift "lib"
# The steps below instantiate SRX::English::WordSplitter, so the library
# file that defines it has to be loaded here.
require 'srx/english/word_splitter'

# Remember the quoted sentence from the step text, tagged as UTF-8.
Given /^a sentence '([^']+)'$/ do |sentence|
  @sentence = sentence.force_encoding('utf-8')
end

# Build the splitter for the remembered sentence.
When /^the sentence is split$/ do
  @splitter = SRX::English::WordSplitter.new(@sentence)
end

# Compare every detected [segment, type] pair with the table rows.
Then /^the following segments should be detected$/ do |table|
  table.hashes.zip(@splitter.to_a).each do |expected,returned|
    # Single quotes in the expected cell are removed before comparing, so a
    # cell written as ' ' stands for one space.
    returned[0].should == expected[:segment].gsub(/'/,"")
    returned[1].should == expected[:type].to_sym
  end
end

# Same comparison as above, but segments typed :other are dropped first.
Then /^the following non-blank segments should be detected$/ do |table|
  segments = @splitter.select{|s| s[1] != :other}
  table.hashes.zip(segments).each do |expected,returned|
    returned[0].should == expected[:segment].gsub(/'/,"")
    returned[1].should == expected[:type].to_sym
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
Feature: word splitter
|
2
|
+
Scenario: splitting a sentence
|
3
|
+
Given a sentence 'My home is my castle.'
|
4
|
+
When the sentence is split
|
5
|
+
Then the following segments should be detected
|
6
|
+
| segment | type |
|
7
|
+
#-----------------#
|
8
|
+
| My | word |
|
9
|
+
| ' ' | other |
|
10
|
+
| home | word |
|
11
|
+
| ' ' | other |
|
12
|
+
| is | word |
|
13
|
+
| ' ' | other |
|
14
|
+
| my | word |
|
15
|
+
| ' ' | other |
|
16
|
+
| castle | word |
|
17
|
+
| . | punct |
|
@@ -0,0 +1,96 @@
|
|
1
|
+
#encoding: utf-8
|
2
|
+
require 'stringio'
|
3
|
+
require 'term/ansicolor'
|
4
|
+
module SRX
  module English
    # Segmentation rules. Each entry is
    #   [before_pattern, after_pattern_or_nil, break?]
    # where +before_pattern+ has to match the end of the text preceding a
    # candidate position, +after_pattern+ the start of the text following it
    # (+nil+ matches anything) and +break?+ tells whether a matching rule
    # splits the text (+true+) or suppresses a split (+false+).
    # The rules are tried in order and the first matching rule decides.
    #
    # NOTE(review): in the abbreviation rule, +[Vvol]+ matches one single
    # character; it may have been meant as +[Vv]ol+. Left unchanged - confirm
    # against the SRX source the rules were generated from.
    RULES =
      [["(?:['\"„][\\.!?…]['\"”]\\s)|(?:[^\\.]\\s[A-Z]\\.\\s)|(?:\\b(?:St|Gen|Hon|Prof|Dr|Mr|Ms|Mrs|[JS]r|Col|Maj|Brig|Sgt|Capt|Cmnd|Sen|Rev|Rep|Revd)\\.\\s)|(?:\\b(?:St|Gen|Hon|Prof|Dr|Mr|Ms|Mrs|[JS]r|Col|Maj|Brig|Sgt|Capt|Cmnd|Sen|Rev|Rep|Revd)\\.\\s[A-Z]\\.\\s)|(?:\\bApr\\.\\s)|(?:\\bAug\\.\\s)|(?:\\bBros\\.\\s)|(?:\\bCo\\.\\s)|(?:\\bCorp\\.\\s)|(?:\\bDec\\.\\s)|(?:\\bDist\\.\\s)|(?:\\bFeb\\.\\s)|(?:\\bInc\\.\\s)|(?:\\bJan\\.\\s)|(?:\\bJul\\.\\s)|(?:\\bJun\\.\\s)|(?:\\bMar\\.\\s)|(?:\\bNov\\.\\s)|(?:\\bOct\\.\\s)|(?:\\bPh\\.?D\\.\\s)|(?:\\bSept?\\.\\s)|(?:\\b\\p{Lu}\\.\\p{Lu}\\.\\s)|(?:\\b\\p{Lu}\\.\\s\\p{Lu}\\.\\s)|(?:\\bcf\\.\\s)|(?:\\be\\.g\\.\\s)|(?:\\besp\\.\\s)|(?:\\bet\\b\\s\\bal\\.\\s)|(?:\\bvs\\.\\s)|(?:\\p{Ps}[!?]+\\p{Pe} )",
        nil,
        false],
       ["(?:[\\.\\s]\\p{L}{1,2}\\.\\s)", "[\\p{N}\\p{Ll}]", false],
       ["(?:[\\[\\(]*\\.\\.\\.[\\]\\)]* )", "[^\\p{Lu}]", false],
       ["(?:\\b(?:pp|[Vv]iz|i\\.?\\s*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl|Pres|[Dd]ept|min|max|[Gg]ovt|lb|ft|c\\.?\\s*f|vs)\\.\\s)",
        "[^\\p{Lu}]|I",
        false],
       # "etc." does not end a sentence unless an uppercase letter follows.
       # The after-pattern needs the backslash: without it the class would
       # exclude the literal characters 'p', '{', 'L', 'u', '}' instead of
       # uppercase letters.
       ["(?:\\b[Ee]tc\\.\\s)", "[^\\p{Lu}]", false],
       ["(?:[\\.!?…]+\\p{Pe} )|(?:[\\[\\(]*…[\\]\\)]* )", "\\p{Ll}", false],
       ["(?:\\b\\p{L}\\.)", "\\p{L}\\.", false],
       ["(?:\\b\\p{L}\\.\\s)", "\\p{L}\\.\\s", false],
       ["(?:\\b[Ff]igs?\\.\\s)|(?:\\b[nN]o\\.\\s)", "\\p{N}", false],
       ["(?:[\"”']\\s*)", "\\s*\\p{Ll}", false],
       ["(?:[\\.!?…][\\u00BB\\u2019\\u201D\\u203A\"'\\p{Pe}\\u0002]*\\s)|(?:\\r?\\n)",
        nil,
        true],
       ["(?:[\\.!?…]['\"\\u00BB\\u2019\\u201D\\u203A\\p{Pe}\\u0002]*)",
        "\\p{Lu}[^\\p{Lu}]",
        true],
       ["(?:\\s\\p{L}[\\.!?…]\\s)", "\\p{Lu}\\p{Ll}", true]]

    # Union of all "before" patterns, anchored at the end of the string.
    # Used as a cheap pre-check before the rules are tried one by one.
    BEFORE_RE = /(?:#{RULES.map{|s,e,v| "(#{s})"}.join("|")})\Z/m

    # One [before_regexp, after_regexp, break?] triple per rule. The before
    # regexp is anchored at the end of the string, the after regexp at its
    # start; a +nil+ after-pattern becomes an empty group matching anything.
    REGEXPS = RULES.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }

    # Matches the first character of a string (newlines included).
    # Not used by SentenceSplitter itself any more; kept so that code
    # referring to the constant keeps working.
    FIRST_CHAR = /\A./m


    # Splits a text into sentences according to RULES.
    class SentenceSplitter
      include Enumerable

      # Number of characters of context kept on each side of the position
      # that is currently examined.
      BUFFER_LENGTH = 10

      attr_accessor :input
      attr_writer :debug

      # The sentence splitter is initialized with the +text+ to split.
      # This might be a String or an IO object. A String is wrapped in a
      # read-only UTF-8 StringIO; any other object is used as given and has
      # to respond to +pos=+, +eof?+ and +readchar+.
      def initialize(text=nil)
        if text.is_a?(String)
          @input = StringIO.new(text,"r:utf-8")
        else
          @input = text
        end
      end

      # Iterate over the sentences in the text, yielding each sentence
      # (including its trailing whitespace) as a String.
      #
      # The input is rewound first, so the method may be called repeatedly.
      # Without a block an Enumerator is returned.
      #
      # Raises a RuntimeError if the input is nil.
      def each
        raise "Invalid argument - text is nil" if @input.nil?
        return enum_for(:each) unless block_given?

        sentence = "".dup
        before_buffer = "".dup
        @input.pos = 0
        # Texts shorter than the buffer simply yield a shorter buffer.
        after_buffer = read_chars(BUFFER_LENGTH)

        # Walk through the text one character at a time. The loop runs until
        # the look-ahead buffer is drained (not merely until the input is
        # exhausted) so that boundaries located in the last BUFFER_LENGTH
        # characters are examined as well.
        until after_buffer.empty?
          if BEFORE_RE.match(before_buffer)
            if @debug
              puts "#{before_buffer}|#{after_buffer.gsub(/\n/,"\\n")}"
            end
            REGEXPS.each do |before_re,after_re,value|
              next unless before_re.match(before_buffer) && after_re.match(after_buffer)
              if @debug
                color = value ? :red : :green
                sentence << Term::ANSIColor.send(color,"<#{before_re}:#{after_re}>")
              end
              if value
                yield sentence
                sentence = "".dup
              end
              # The first matching rule decides; later rules are not tried.
              break
            end
          end

          # Move one character from the look-ahead buffer to the look-behind
          # buffer (dropping its oldest character once it is full) and to the
          # sentence being collected, then refill the look-ahead buffer.
          current = after_buffer.slice!(0, 1)
          before_buffer.slice!(0, 1) if before_buffer.size >= BUFFER_LENGTH
          before_buffer << current
          sentence << current
          after_buffer << @input.readchar unless @input.eof?
        end

        # Whatever follows the last detected boundary is the final sentence.
        yield sentence unless sentence.empty?
      end

      private

      # Reads up to +count+ characters from the input, stopping early when
      # the input is exhausted.
      def read_chars(count)
        chars = "".dup
        count.times do
          break if @input.eof?
          chars << @input.readchar
        end
        chars
      end
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module SRX
  module English
    # Splits a sentence into typed segments (words, numbers, punctuation,
    # other visible characters and everything else).
    class WordSplitter
      include Enumerable

      attr_accessor :sentence

      # Segment type => pattern. The order matters: the patterns are joined
      # into one alternation, so an earlier type wins over a later one.
      SPLIT_RULES = {
        :word => "\\p{Alpha}\\p{Word}*",
        :number => "\\p{Digit}+(?:[:., _/-]\\p{Digit}+)*",
        :punct => "\\p{Punct}",
        :graph => "\\p{Graph}",
        :other => "[^\\p{Word}\\p{Graph}]+"
      }

      # One capture group per entry of SPLIT_RULES, in the same order.
      SPLIT_RE = /#{SPLIT_RULES.values.map{|v| "(#{v})"}.join("|")}/m

      # The initializer accepts a +sentence+; the object only has to
      # respond to +scan+ the way a String does.
      #
      # The splitter might be initialized without the sentence,
      # but it should be set using the accessor before the first call to
      # the +each+ method.
      def initialize(sentence=nil)
        @sentence = sentence
      end

      # This method iterates over the segments of the sentence.
      # It yields the string representation of the segment and
      # its type, which is one of:
      # * +:word+ - a regular word (including words containing numbers, like A4)
      # * +:number+ - a number (including number with spaces, dashes, slashes, etc.)
      # * +:punct+ - single punctuation character (comma, semicolon, full stop, etc.)
      # * +:graph+ - any single graphical (visible) character
      # * +:other+ - anything which is not covered by the above types (non-visible
      #   characters in particular)
      #
      # Without a block an Enumerator is returned.
      #
      # Raises a RuntimeError if the sentence is nil.
      def each
        raise "Invalid argument - sentence is nil" if @sentence.nil?
        return enum_for(:each) unless block_given?

        # Exactly one of the five capture groups is set for every match.
        @sentence.scan(SPLIT_RE) do |word,number,punct,graph,other|
          if word
            yield word, :word
          elsif number
            yield number, :number
          elsif punct
            yield punct, :punct
          elsif graph
            yield graph, :graph
          else
            yield other, :other
          end
        end
      end
    end
  end
end
|
57
|
+
|
data/srx-english.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)

# Gem specification for srx-english.
Gem::Specification.new do |s|
  s.name        = "srx-english"
  s.version     = "0.1.0"
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Aleksander Pohl"]
  s.email       = ["apohllo@o2.pl"]
  # Home of this gem itself (srx2ruby is only the tool that generated it).
  s.homepage    = "https://github.com/apohllo/srx-english"
  s.summary     = %q{English sentence and word segmentation rules.}
  s.description = %q{English sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Miłkowski's SRX rules.}

  s.rubyforge_project = "srx-english"

  # The file lists are taken from git, so the gem has to be built from a
  # git checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # Required at load time by lib/srx/english/sentence_splitter.rb.
  s.add_dependency("term-ansicolor", ["~> 1.0.5"])
end
|
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: srx-english
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.1.0
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Aleksander Pohl
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2012-04-19 00:00:00 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: term-ansicolor
|
17
|
+
prerelease: false
|
18
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
19
|
+
none: false
|
20
|
+
requirements:
|
21
|
+
- - ~>
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.0.5
|
24
|
+
type: :runtime
|
25
|
+
version_requirements: *id001
|
26
|
+
description: "English sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Mi\xC5\x82kowski's SRX rules."
|
27
|
+
email:
|
28
|
+
- apohllo@o2.pl
|
29
|
+
executables: []
|
30
|
+
|
31
|
+
extensions: []
|
32
|
+
|
33
|
+
extra_rdoc_files: []
|
34
|
+
|
35
|
+
files:
|
36
|
+
- .gitignore
|
37
|
+
- README.rdoc
|
38
|
+
- changelog.txt
|
39
|
+
- features/sentence_splitter.feature
|
40
|
+
- features/steps/sentence_splitter.rb
|
41
|
+
- features/steps/word_splitter.rb
|
42
|
+
- features/word_splitter.feature
|
43
|
+
- lib/srx/english/sentence_splitter.rb
|
44
|
+
- lib/srx/english/word_splitter.rb
|
45
|
+
- srx-english.gemspec
|
46
|
+
homepage: http://github.com/apohllo/srx2ruby
|
47
|
+
licenses: []
|
48
|
+
|
49
|
+
post_install_message:
|
50
|
+
rdoc_options: []
|
51
|
+
|
52
|
+
require_paths:
|
53
|
+
- lib
|
54
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: "0"
|
60
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: "0"
|
66
|
+
requirements: []
|
67
|
+
|
68
|
+
rubyforge_project: srx-english
|
69
|
+
rubygems_version: 1.8.21
|
70
|
+
signing_key:
|
71
|
+
specification_version: 3
|
72
|
+
summary: English sentence and word segmentation rules.
|
73
|
+
test_files:
|
74
|
+
- features/sentence_splitter.feature
|
75
|
+
- features/steps/sentence_splitter.rb
|
76
|
+
- features/steps/word_splitter.rb
|
77
|
+
- features/word_splitter.feature
|