srx-english 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ .*.sw?
2
+ work
3
+ pkg
data/README.rdoc ADDED
@@ -0,0 +1,75 @@
1
+ == srx-english
2
+
3
+ * https://github.com/apohllo/srx-english
4
+
5
+ = DESCRIPTION
6
+
7
+ 'srx-english' is a Ruby library containing English sentence and word segmentation rules.
8
+ The sentence segmentation rules are based on rules defined by Marcin Miłkowski:
9
+ http://morfologik.blogspot.com/2009/11/talking-about-srx-in-lt-during-ltc.html
10
+
11
+ = FEATURES/PROBLEMS
12
+
13
+ * this library is generated by 'srx2ruby' which has some limitations and might
14
+ not be 100% compliant with the SRX standard.
15
+
16
+ = INSTALL
17
+
18
+ Standard rubygems installation:
19
+
20
+ $ gem install srx-english
21
+
22
+ = BASIC USAGE
23
+
24
+ The library defines the SRX::English::SentenceSplitter class, which allows you to iterate
25
+ over the matched sentences:
26
+
27
+ require 'srx/english/sentence_splitter'
28
+
29
+ text =<<-END
30
+ This is e.g. Mr. Smith, who talks slowly... And this is another sentence.
31
+ END
32
+
33
+ splitter = SRX::English::SentenceSplitter.new(text)
34
+ splitter.each do |sentence|
35
+ puts sentence.gsub(/\n|\r/,"")
36
+ end
37
+ # This is e.g. Mr. Smith, who talks slowly...
38
+ # And this is another sentence.
39
+
40
+ require 'srx/english/word_splitter'
41
+
42
+ sentence = 'My home is my castle.'
43
+ splitter = SRX::English::WordSplitter.new(sentence)
44
+ splitter.each do |word,type|
45
+ puts "'#{word}' #{type}"
46
+ end
47
+ # 'My' word
48
+ # ' ' other
49
+ # 'home' word
50
+ # ' ' other
51
+ # 'is' word
52
+ # ' ' other
53
+ # 'my' word
54
+ # ' ' other
55
+ # 'castle' word
56
+ # '.' punct
57
+
58
+
59
+ == LICENSE
60
+
61
+ Copyright (C) 2011 Aleksander Pohl, Marcin Miłkowski, Jarosław Lipski
62
+
63
+ This program is free software: you can redistribute it and/or modify
64
+ it under the terms of the GNU General Public License as published by
65
+ the Free Software Foundation, either version 3 of the License, or
66
+ (at your option) any later version.
67
+
68
+ This program is distributed in the hope that it will be useful,
69
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
70
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
71
+ GNU General Public License for more details.
72
+
73
+ == FEEDBACK
74
+
75
+ * mailto:apohllo@o2.pl
data/changelog.txt ADDED
@@ -0,0 +1,2 @@
1
+ 0.1.0
2
+ - First release of sentence segmentation rules for English
@@ -0,0 +1,34 @@
1
+ Feature: sentence splitter
2
+ Scenario: splitting text
3
+ Given a text
4
+ """
5
+ It [really!] works.
6
+ """
7
+ When the text is split
8
+ Then the following sentences should be detected
9
+ | sentence |
10
+ #-------------------- #
11
+ | It [really!] works. |
12
+
13
+ Given a text
14
+ """
15
+ This is e.g. Mr. Smith, who talks slowly... And this is another sentence.
16
+ """
17
+ When the text is split
18
+ Then the following sentences should be detected
19
+ | sentence |
20
+ #---------------------------------------------#
21
+ | This is e.g. Mr. Smith, who talks slowly... |
22
+ | And this is another sentence. |
23
+
24
+ Given a text
25
+ """
26
+ Leave me alone!, he yelled. I am in the U.S. Army. Charles (Ind.) said he.
27
+ """
28
+ When the text is split
29
+ Then the following sentences should be detected
30
+ | sentence |
31
+ #-----------------------------#
32
+ | Leave me alone!, he yelled. |
33
+ | I am in the U.S. Army. |
34
+ | Charles (Ind.) said he. |
@@ -0,0 +1,17 @@
1
# encoding: utf-8
# Cucumber step definitions for the sentence splitter feature.
$:.unshift "lib"
require 'srx/english/sentence_splitter'

# Stores the doc-string attached to the step as the text to be split.
Given /^a text$/ do |text|
  @text = text
end

# Builds the splitter; the actual splitting happens lazily on iteration.
When /^the text is split$/ do
  @splitter = SRX::English::SentenceSplitter.new(@text)
end

# Compares the complete list of detected sentences with the table.
# Comparing whole lists (instead of zipping them pairwise) makes the step fail
# when the splitter returns more or fewer sentences than expected, rather than
# silently ignoring the surplus or crashing on a nil entry.
Then /^the following sentences should be detected$/ do |table|
  expected = table.hashes.map { |row| row[:sentence] }
  # Newlines (with any whitespace preceding them) are collapsed to a single
  # space and the outer whitespace is dropped before comparison.
  detected = @splitter.map { |sentence| sentence.gsub(/\s*\n/, " ").strip }
  detected.should == expected
end
@@ -0,0 +1,26 @@
1
# encoding: utf-8
# Cucumber step definitions for the word splitter feature.
$:.unshift "lib"
# The splitter must be loaded here: nothing else in the step files requires it.
require 'srx/english/word_splitter'

# Captures the quoted sentence from the step text.
Given /^a sentence '([^']+)'$/ do |sentence|
  @sentence = sentence.force_encoding('utf-8')
end

# Builds the splitter; the actual splitting happens lazily on iteration.
When /^the sentence is split$/ do
  @splitter = SRX::English::WordSplitter.new(@sentence)
end

# Compares every detected [segment, type] pair with the table rows.
# Quotes are stripped from the expected segment so that whitespace-only
# segments can be written as ' ' in the feature table.
# Whole lists are compared so that missing or surplus segments fail the step.
Then /^the following segments should be detected$/ do |table|
  expected = table.hashes.map do |row|
    [row[:segment].gsub(/'/, ""), row[:type].to_sym]
  end
  @splitter.to_a.should == expected
end

# Same as above, but segments of type :other (e.g. whitespace) are ignored.
Then /^the following non-blank segments should be detected$/ do |table|
  expected = table.hashes.map do |row|
    [row[:segment].gsub(/'/, ""), row[:type].to_sym]
  end
  segments = @splitter.select { |_segment, type| type != :other }
  segments.should == expected
end
@@ -0,0 +1,17 @@
1
+ Feature: word splitter
2
+ Scenario: splitting a sentence
3
+ Given a sentence 'My home is my castle.'
4
+ When the sentence is split
5
+ Then the following segments should be detected
6
+ | segment | type |
7
+ #-----------------#
8
+ | My | word |
9
+ | ' ' | other |
10
+ | home | word |
11
+ | ' ' | other |
12
+ | is | word |
13
+ | ' ' | other |
14
+ | my | word |
15
+ | ' ' | other |
16
+ | castle | word |
17
+ | . | punct |
@@ -0,0 +1,96 @@
1
+ #encoding: utf-8
2
+ require 'stringio'
3
+ require 'term/ansicolor'
4
module SRX
  module English
    # Sentence segmentation rules.
    #
    # Each rule is a triple: [before_pattern, after_pattern, break].
    # * before_pattern - regexp source that has to match the END of the text
    #   preceding a candidate position,
    # * after_pattern - regexp source that has to match the START of the text
    #   following the position (nil means "anything"),
    # * break - true if the position is a sentence break, false if the rule
    #   is an exception which suppresses a break.
    #
    # The rules are tried in order and the first rule whose both patterns
    # match decides, so the exceptions have to precede the break rules.
    RULES =
      [["(?:['\"„][\\.!?…]['\"”]\\s)|(?:[^\\.]\\s[A-Z]\\.\\s)|(?:\\b(?:St|Gen|Hon|Prof|Dr|Mr|Ms|Mrs|[JS]r|Col|Maj|Brig|Sgt|Capt|Cmnd|Sen|Rev|Rep|Revd)\\.\\s)|(?:\\b(?:St|Gen|Hon|Prof|Dr|Mr|Ms|Mrs|[JS]r|Col|Maj|Brig|Sgt|Capt|Cmnd|Sen|Rev|Rep|Revd)\\.\\s[A-Z]\\.\\s)|(?:\\bApr\\.\\s)|(?:\\bAug\\.\\s)|(?:\\bBros\\.\\s)|(?:\\bCo\\.\\s)|(?:\\bCorp\\.\\s)|(?:\\bDec\\.\\s)|(?:\\bDist\\.\\s)|(?:\\bFeb\\.\\s)|(?:\\bInc\\.\\s)|(?:\\bJan\\.\\s)|(?:\\bJul\\.\\s)|(?:\\bJun\\.\\s)|(?:\\bMar\\.\\s)|(?:\\bNov\\.\\s)|(?:\\bOct\\.\\s)|(?:\\bPh\\.?D\\.\\s)|(?:\\bSept?\\.\\s)|(?:\\b\\p{Lu}\\.\\p{Lu}\\.\\s)|(?:\\b\\p{Lu}\\.\\s\\p{Lu}\\.\\s)|(?:\\bcf\\.\\s)|(?:\\be\\.g\\.\\s)|(?:\\besp\\.\\s)|(?:\\bet\\b\\s\\bal\\.\\s)|(?:\\bvs\\.\\s)|(?:\\p{Ps}[!?]+\\p{Pe} )",
        nil,
        false],
       ["(?:[\\.\\s]\\p{L}{1,2}\\.\\s)", "[\\p{N}\\p{Ll}]", false],
       ["(?:[\\[\\(]*\\.\\.\\.[\\]\\)]* )", "[^\\p{Lu}]", false],
       # NOTE(review): "[Vvol]" and "[Rr]col" look like typos (perhaps "[Vv]ol"
       # and another abbreviation) - left unchanged, confirm against the SRX
       # source before touching them.
       ["(?:\\b(?:pp|[Vv]iz|i\\.?\\s*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl|Pres|[Dd]ept|min|max|[Gg]ovt|lb|ft|c\\.?\\s*f|vs)\\.\\s)",
        "[^\\p{Lu}]|I",
        false],
       # "etc." does not end a sentence unless an uppercase letter follows.
       # The after-pattern used to be "[^p{Lu}]" (missing backslash), which
       # matched nearly every character and suppressed the break even before
       # an uppercase letter.
       ["(?:\\b[Ee]tc\\.\\s)", "[^\\p{Lu}]", false],
       ["(?:[\\.!?…]+\\p{Pe} )|(?:[\\[\\(]*…[\\]\\)]* )", "\\p{Ll}", false],
       ["(?:\\b\\p{L}\\.)", "\\p{L}\\.", false],
       ["(?:\\b\\p{L}\\.\\s)", "\\p{L}\\.\\s", false],
       ["(?:\\b[Ff]igs?\\.\\s)|(?:\\b[nN]o\\.\\s)", "\\p{N}", false],
       ["(?:[\"”']\\s*)", "\\s*\\p{Ll}", false],
       ["(?:[\\.!?…][\\u00BB\\u2019\\u201D\\u203A\"'\\p{Pe}\\u0002]*\\s)|(?:\\r?\\n)",
        nil,
        true],
       ["(?:[\\.!?…]['\"\\u00BB\\u2019\\u201D\\u203A\\p{Pe}\\u0002]*)",
        "\\p{Lu}[^\\p{Lu}]",
        true],
       ["(?:\\s\\p{L}[\\.!?…]\\s)", "\\p{Lu}\\p{Ll}", true]]

    # Union of all before-patterns; a cheap pre-check performed before the
    # individual rules are tried one by one.
    BEFORE_RE = /(?:#{RULES.map{|s,e,v| "(#{s})"}.join("|")})\Z/m

    # Compiled rules: [before regexp anchored at the end of the text,
    # after regexp anchored at the start of the text, break flag].
    # A nil after-pattern compiles to an empty group, which always matches.
    REGEXPS = RULES.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }

    # Matches the first character of a string (newline included).
    FIRST_CHAR = /\A./m


    # Splits a text into sentences according to RULES.
    #
    #   splitter = SRX::English::SentenceSplitter.new("Hello there. Bye.")
    #   splitter.to_a # => ["Hello there. ", "Bye."]
    class SentenceSplitter
      include Enumerable

      attr_accessor :input
      attr_writer :debug

      # The sentence splitter is initialized with the +text+ to split.
      # This might be a String or an IO object. Any non-String value is used
      # as the input as is; +each+ calls +pos=+, +eof?+ and +readchar+ on it.
      def initialize(text=nil)
        if text.is_a?(String)
          @input = StringIO.new(text,"r:utf-8")
        else
          @input = text
        end
      end

      # Iterate over the sentences in the text.
      #
      # The text is scanned character by character with two sliding windows:
      # up to 10 characters before the current position and up to 10
      # characters after it. At each position the first rule whose patterns
      # match both windows decides if the sentence is broken there.
      # Every position of the text is examined, including those in the last
      # 10 characters, and texts shorter than the window are supported.
      #
      # The yielded sentences include their trailing whitespace.
      # If the debug flag is set, the windows are printed for each candidate
      # position and the matched rule is inserted (colored) into the sentence.
      #
      # Returns an Enumerator if no block is given.
      # Raises RuntimeError if the text is nil.
      def each
        raise "Invalid argument - text is nil" if @input.nil?
        return enum_for(:each) unless block_given?

        buffer_length = 10
        sentence = ""
        before_buffer = ""
        after_buffer = ""
        @input.pos = 0
        # Pre-fill the look-ahead window; the text might be shorter than it.
        until after_buffer.size >= buffer_length || @input.eof?
          after_buffer << @input.readchar
        end

        loop do
          if BEFORE_RE.match(before_buffer)
            if @debug
              puts "#{before_buffer}|#{after_buffer.gsub(/\n/,"\\n")}"
            end
            # The first rule matching on both sides wins.
            rule = REGEXPS.find do |before_re, after_re, _value|
              before_re.match(before_buffer) && after_re.match(after_buffer)
            end
            if rule
              before_re, after_re, value = rule
              if @debug
                color = value ? :red : :green
                sentence << Term::ANSIColor.send(color,"<#{before_re}:#{after_re}>")
              end
              if value
                yield sentence
                sentence = ""
              end
            end
          end
          # The position after the last character has been examined - done.
          break if after_buffer.empty?

          # Advance by one character: the head of the look-ahead window goes
          # to the look-behind window (which is kept at most buffer_length
          # characters long) and to the current sentence.
          before_buffer.slice!(0) if before_buffer.size >= buffer_length
          moved = after_buffer.slice!(0)
          before_buffer << moved
          sentence << moved
          after_buffer << @input.readchar unless @input.eof?
        end
        # Flush the text following the last detected break.
        yield sentence unless sentence.empty?
      end
    end
  end
end
@@ -0,0 +1,57 @@
1
+ # encoding: utf-8
2
+
3
module SRX
  module English
    # Splits a sentence into typed segments (words, numbers, punctuation...).
    #
    #   splitter = SRX::English::WordSplitter.new("Page 12")
    #   splitter.to_a # => [["Page", :word], [" ", :other], ["12", :number]]
    class WordSplitter
      include Enumerable

      attr_accessor :sentence

      # Segment type => regexp source recognizing a segment of that type.
      # The order matters: the alternatives are tried from top to bottom.
      SPLIT_RULES = {
        :word => "\\p{Alpha}\\p{Word}*",
        :number => "\\p{Digit}+(?:[:., _/-]\\p{Digit}+)*",
        :punct => "\\p{Punct}",
        :graph => "\\p{Graph}",
        :other => "[^\\p{Word}\\p{Graph}]+"
      }

      # One capturing group per rule, in the order of SPLIT_RULES.
      SPLIT_RE = /#{SPLIT_RULES.values.map{|v| "(#{v})"}.join("|")}/m

      # The initializer accepts a +sentence+; +each+ calls +scan+ on it,
      # so a String is the expected value.
      #
      # The splitter might be initialized without the sentence,
      # but it should be set using the accessor before the first call to
      # the +each+ method.
      def initialize(sentence=nil)
        @sentence = sentence
      end

      # This method iterates over the segments of the sentence.
      # It yields the string representation of the segment and
      # its type, which is one of:
      # * +:word+ - a regular word (including words containing numbers, like A4)
      # * +:number+ - a number (including number with spaces, dashes, slashes, etc.)
      # * +:punct+ - single punctuation character (comma, semicolon, full stop, etc.)
      # * +:graph+ - any single graphical (visible) character
      # * +:other+ - anything which is not covered by the above types (non-visible
      #   characters in particular)
      #
      # Returns an Enumerator if no block is given.
      # Raises RuntimeError if the sentence is nil.
      def each
        raise "Invalid argument - sentence is nil" if @sentence.nil?
        return enum_for(:each) unless block_given?

        types = SPLIT_RULES.keys
        @sentence.scan(SPLIT_RE) do |captures|
          # Exactly one group participates in each match; its position
          # identifies the rule (and thus the type) of the segment.
          matched = captures.index { |capture| !capture.nil? }
          yield captures[matched], types[matched]
        end
      end
    end
  end
end
57
+
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+
4
# Gem specification of the srx-english library.
Gem::Specification.new do |s|
  s.name = "srx-english"
  s.version = "0.1.0"
  s.platform = Gem::Platform::RUBY
  s.authors = ["Aleksander Pohl"]
  s.email = ["apohllo@o2.pl"]
  # The project page named in the README (previously this pointed at the
  # srx2ruby generator project).
  s.homepage = "https://github.com/apohllo/srx-english"
  # The README states GNU GPL, version 3 or (at your option) any later version.
  s.license = "GPL-3.0-or-later"
  s.summary = %q{English sentence and word segmentation rules.}
  s.description = %q{English sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Miłkowski's SRX rules.}

  s.rubyforge_project = "srx-english"
  s.has_rdoc = false

  # The packaged files are taken from the git index, so the gem has to be
  # built inside a git checkout.
  s.files = `git ls-files`.split("\n")
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # term-ansicolor is used to color the debug output of the sentence splitter.
  s.add_dependency("term-ansicolor", ["~> 1.0.5"])
end
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: srx-english
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.1.0
6
+ platform: ruby
7
+ authors:
8
+ - Aleksander Pohl
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2012-04-19 00:00:00 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: term-ansicolor
17
+ prerelease: false
18
+ requirement: &id001 !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - ~>
22
+ - !ruby/object:Gem::Version
23
+ version: 1.0.5
24
+ type: :runtime
25
+ version_requirements: *id001
26
+ description: "English sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Mi\xC5\x82kowski's SRX rules."
27
+ email:
28
+ - apohllo@o2.pl
29
+ executables: []
30
+
31
+ extensions: []
32
+
33
+ extra_rdoc_files: []
34
+
35
+ files:
36
+ - .gitignore
37
+ - README.rdoc
38
+ - changelog.txt
39
+ - features/sentence_splitter.feature
40
+ - features/steps/sentence_splitter.rb
41
+ - features/steps/word_splitter.rb
42
+ - features/word_splitter.feature
43
+ - lib/srx/english/sentence_splitter.rb
44
+ - lib/srx/english/word_splitter.rb
45
+ - srx-english.gemspec
46
+ homepage: http://github.com/apohllo/srx2ruby
47
+ licenses: []
48
+
49
+ post_install_message:
50
+ rdoc_options: []
51
+
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: "0"
60
+ required_rubygems_version: !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: "0"
66
+ requirements: []
67
+
68
+ rubyforge_project: srx-english
69
+ rubygems_version: 1.8.21
70
+ signing_key:
71
+ specification_version: 3
72
+ summary: English sentence and word segmentation rules.
73
+ test_files:
74
+ - features/sentence_splitter.feature
75
+ - features/steps/sentence_splitter.rb
76
+ - features/steps/word_splitter.rb
77
+ - features/word_splitter.feature