srx-english 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ .*.sw?
2
+ work
3
+ pkg
data/README.rdoc ADDED
@@ -0,0 +1,75 @@
1
+ == srx-english
2
+
3
+ * https://github.com/apohllo/srx-english
4
+
5
+ = DESCRIPTION
6
+
7
+ 'srx-english' is a Ruby library containing English sentence and word segmentation rules.
8
+ The sentence segmentation rules are based on rules defined by Marcin Miłkowski:
9
+ http://morfologik.blogspot.com/2009/11/talking-about-srx-in-lt-during-ltc.html
10
+
11
+ = FEATURES/PROBLEMS
12
+
13
+ * this library is generated by 'srx2ruby', which has some limitations and might
14
+ not be 100% compliant with the SRX standard.
15
+
16
+ = INSTALL
17
+
18
+ Standard rubygems installation:
19
+
20
+ $ gem install srx-english
21
+
22
+ = BASIC USAGE
23
+
24
+ The library defines the SRX::English::SentenceSplitter class, which allows iterating
25
+ over the matched sentences:
26
+
27
+ require 'srx/english/sentence_splitter'
28
+
29
+ text =<<-END
30
+ This is e.g. Mr. Smith, who talks slowly... And this is another sentence.
31
+ END
32
+
33
+ splitter = SRX::English::SentenceSplitter.new(text)
34
+ splitter.each do |sentence|
35
+ puts sentence.gsub(/\n|\r/,"")
36
+ end
37
+ # This is e.g. Mr. Smith, who talks slowly...
38
+ # And this is another sentence.
39
+
40
+ require 'srx/english/word_splitter'
41
+
42
+ sentence = 'My home is my castle.'
43
+ splitter = SRX::English::WordSplitter.new(sentence)
44
+ splitter.each do |word,type|
45
+ puts "'#{word}' #{type}"
46
+ end
47
+ # 'My' word
48
+ # ' ' other
49
+ # 'home' word
50
+ # ' ' other
51
+ # 'is' word
52
+ # ' ' other
53
+ # 'my' word
54
+ # ' ' other
55
+ # 'castle' word
56
+ # '.' punct
57
+
58
+
59
+ == LICENSE
60
+
61
+ Copyright (C) 2011 Aleksander Pohl, Marcin Miłkowski, Jarosław Lipski
62
+
63
+ This program is free software: you can redistribute it and/or modify
64
+ it under the terms of the GNU General Public License as published by
65
+ the Free Software Foundation, either version 3 of the License, or
66
+ (at your option) any later version.
67
+
68
+ This program is distributed in the hope that it will be useful,
69
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
70
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
71
+ GNU General Public License for more details.
72
+
73
+ == FEEDBACK
74
+
75
+ * mailto:apohllo@o2.pl
data/changelog.txt ADDED
@@ -0,0 +1,2 @@
1
+ 0.1.0
2
+ - First release of sentence segmentation rules for English
@@ -0,0 +1,34 @@
1
+ Feature: sentence splitter
2
+ Scenario: splitting text
3
+ Given a text
4
+ """
5
+ It [really!] works.
6
+ """
7
+ When the text is split
8
+ Then the following sentences should be detected
9
+ | sentence |
10
+ #-------------------- #
11
+ | It [really!] works. |
12
+
13
+ Given a text
14
+ """
15
+ This is e.g. Mr. Smith, who talks slowly... And this is another sentence.
16
+ """
17
+ When the text is split
18
+ Then the following sentences should be detected
19
+ | sentence |
20
+ #---------------------------------------------#
21
+ | This is e.g. Mr. Smith, who talks slowly... |
22
+ | And this is another sentence. |
23
+
24
+ Given a text
25
+ """
26
+ Leave me alone!, he yelled. I am in the U.S. Army. Charles (Ind.) said he.
27
+ """
28
+ When the text is split
29
+ Then the following sentences should be detected
30
+ | sentence |
31
+ #-----------------------------#
32
+ | Leave me alone!, he yelled. |
33
+ | I am in the U.S. Army. |
34
+ | Charles (Ind.) said he. |
@@ -0,0 +1,17 @@
# encoding: utf-8
# Cucumber step definitions for features/sentence_splitter.feature.
$:.unshift "lib"
require 'srx/english/sentence_splitter'

# Remembers the doc string attached to the step as the text to split.
Given /^a text$/ do |text|
  @text = text
end

# Builds a splitter for the remembered text; splitting happens lazily
# when the splitter is enumerated.
When /^the text is split$/ do
  @splitter = SRX::English::SentenceSplitter.new(@text)
end

# Compares the detected sentences with the rows of the +sentence+ column.
Then /^the following sentences should be detected$/ do |table|
  sentences = @splitter.to_a
  # zip alone would ignore surplus sentences and fail with a NoMethodError
  # on nil for missing ones, so check the count explicitly first.
  sentences.size.should == table.hashes.size
  table.hashes.zip(sentences).each do |expected, returned|
    # Line breaks inside a sentence are folded into single spaces before
    # the comparison, and surrounding whitespace is ignored.
    returned.gsub(/\s*\n/, " ").strip.should == expected[:sentence]
  end
end
@@ -0,0 +1,26 @@
# encoding: utf-8
# Cucumber step definitions for features/word_splitter.feature.
$:.unshift "lib"
# The steps below reference SRX::English::WordSplitter, so the library file
# has to be loaded here.
require 'srx/english/word_splitter'

# Remembers the quoted sentence from the step text.
Given /^a sentence '([^']+)'$/ do |sentence|
  @sentence = sentence.force_encoding('utf-8')
end

# Builds a word splitter for the remembered sentence.
When /^the sentence is split$/ do
  @splitter = SRX::English::WordSplitter.new(@sentence)
end

# Compares every detected [segment, type] pair with the table rows.
# Quotes in the +segment+ column (used to make blanks visible) are removed
# before the comparison.
Then /^the following segments should be detected$/ do |table|
  segments = @splitter.to_a
  # zip alone would silently ignore surplus segments.
  segments.size.should == table.hashes.size
  table.hashes.zip(segments).each do |expected, returned|
    returned[0].should == expected[:segment].gsub(/'/, "")
    returned[1].should == expected[:type].to_sym
  end
end

# Same as above, but segments of type +:other+ are skipped.
Then /^the following non-blank segments should be detected$/ do |table|
  segments = @splitter.select { |s| s[1] != :other }
  # zip alone would silently ignore surplus segments.
  segments.size.should == table.hashes.size
  table.hashes.zip(segments).each do |expected, returned|
    returned[0].should == expected[:segment].gsub(/'/, "")
    returned[1].should == expected[:type].to_sym
  end
end
@@ -0,0 +1,17 @@
1
+ Feature: word splitter
2
+ Scenario: splitting a sentence
3
+ Given a sentence 'My home is my castle.'
4
+ When the sentence is split
5
+ Then the following segments should be detected
6
+ | segment | type |
7
+ #-----------------#
8
+ | My | word |
9
+ | ' ' | other |
10
+ | home | word |
11
+ | ' ' | other |
12
+ | is | word |
13
+ | ' ' | other |
14
+ | my | word |
15
+ | ' ' | other |
16
+ | castle | word |
17
+ | . | punct |
@@ -0,0 +1,96 @@
1
+ #encoding: utf-8
2
+ require 'stringio'
3
+ require 'term/ansicolor'
module SRX
  module English
    # Sentence segmentation rules. Each entry is a triple
    #   [before_pattern, after_pattern, break]
    # where +before_pattern+ has to match the end of the text preceding a
    # candidate position, +after_pattern+ has to match the beginning of the
    # text following it (+nil+ means "anything"), and +break+ tells whether
    # a matching rule splits the text (+true+) or protects the position from
    # being split (+false+). Rules are tried in order; the first one whose
    # both patterns match decides.
    RULES =
      [["(?:['\"„][\\.!?…]['\"”]\\s)|(?:[^\\.]\\s[A-Z]\\.\\s)|(?:\\b(?:St|Gen|Hon|Prof|Dr|Mr|Ms|Mrs|[JS]r|Col|Maj|Brig|Sgt|Capt|Cmnd|Sen|Rev|Rep|Revd)\\.\\s)|(?:\\b(?:St|Gen|Hon|Prof|Dr|Mr|Ms|Mrs|[JS]r|Col|Maj|Brig|Sgt|Capt|Cmnd|Sen|Rev|Rep|Revd)\\.\\s[A-Z]\\.\\s)|(?:\\bApr\\.\\s)|(?:\\bAug\\.\\s)|(?:\\bBros\\.\\s)|(?:\\bCo\\.\\s)|(?:\\bCorp\\.\\s)|(?:\\bDec\\.\\s)|(?:\\bDist\\.\\s)|(?:\\bFeb\\.\\s)|(?:\\bInc\\.\\s)|(?:\\bJan\\.\\s)|(?:\\bJul\\.\\s)|(?:\\bJun\\.\\s)|(?:\\bMar\\.\\s)|(?:\\bNov\\.\\s)|(?:\\bOct\\.\\s)|(?:\\bPh\\.?D\\.\\s)|(?:\\bSept?\\.\\s)|(?:\\b\\p{Lu}\\.\\p{Lu}\\.\\s)|(?:\\b\\p{Lu}\\.\\s\\p{Lu}\\.\\s)|(?:\\bcf\\.\\s)|(?:\\be\\.g\\.\\s)|(?:\\besp\\.\\s)|(?:\\bet\\b\\s\\bal\\.\\s)|(?:\\bvs\\.\\s)|(?:\\p{Ps}[!?]+\\p{Pe} )",
        nil,
        false],
       ["(?:[\\.\\s]\\p{L}{1,2}\\.\\s)", "[\\p{N}\\p{Ll}]", false],
       ["(?:[\\[\\(]*\\.\\.\\.[\\]\\)]* )", "[^\\p{Lu}]", false],
       ["(?:\\b(?:pp|[Vv]iz|i\\.?\\s*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl|Pres|[Dd]ept|min|max|[Gg]ovt|lb|ft|c\\.?\\s*f|vs)\\.\\s)",
        "[^\\p{Lu}]|I",
        false],
       # "etc." does not end a sentence unless an upper-case letter follows.
       # The after pattern must be the negated Unicode property \p{Lu};
       # without the backslash it is a plain character class that accepts
       # almost every character and suppresses all breaks after "etc.".
       ["(?:\\b[Ee]tc\\.\\s)", "[^\\p{Lu}]", false],
       ["(?:[\\.!?…]+\\p{Pe} )|(?:[\\[\\(]*…[\\]\\)]* )", "\\p{Ll}", false],
       ["(?:\\b\\p{L}\\.)", "\\p{L}\\.", false],
       ["(?:\\b\\p{L}\\.\\s)", "\\p{L}\\.\\s", false],
       ["(?:\\b[Ff]igs?\\.\\s)|(?:\\b[nN]o\\.\\s)", "\\p{N}", false],
       ["(?:[\"”']\\s*)", "\\s*\\p{Ll}", false],
       ["(?:[\\.!?…][\\u00BB\\u2019\\u201D\\u203A\"'\\p{Pe}\\u0002]*\\s)|(?:\\r?\\n)",
        nil,
        true],
       ["(?:[\\.!?…]['\"\\u00BB\\u2019\\u201D\\u203A\\p{Pe}\\u0002]*)",
        "\\p{Lu}[^\\p{Lu}]",
        true],
       ["(?:\\s\\p{L}[\\.!?…]\\s)", "\\p{Lu}\\p{Ll}", true]]

    # Union of all "before" patterns, anchored at the end of the buffer.
    # Used as a cheap pre-check before the rules are tried one by one.
    BEFORE_RE = /(?:#{RULES.map{|s,e,v| "(#{s})"}.join("|")})\Z/m

    # Compiled rules: [before regexp anchored at the end of the string,
    # after regexp anchored at the start of the string, break flag].
    # A nil after pattern compiles to an empty group, which always matches.
    REGEXPS = RULES.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }

    # Matches the first character of a string (including a newline).
    # No longer used by SentenceSplitter#each; kept for code that
    # references the constant.
    FIRST_CHAR = /\A./m


    class SentenceSplitter
      include Enumerable

      attr_accessor :input
      attr_writer :debug

      # The sentence splitter is initialized with the +text+ to split.
      # This might be a String or an IO object. A String is wrapped in
      # a StringIO; anything else is used as given and has to support
      # +pos=+, +eof?+ and +readchar+.
      def initialize(text=nil)
        if text.is_a?(String)
          @input = StringIO.new(text,"r:utf-8")
        else
          @input = text
        end
      end

      # Iterate over the sentences in the text, yielding each sentence as
      # a String. The yielded sentences keep their trailing whitespace, so
      # concatenating them reproduces the input (unless debug markers are
      # enabled via +debug=+).
      #
      # The input is scanned with a sliding window of up to 10 characters
      # before and after every candidate position. Inputs shorter than the
      # window are supported, and positions near the end of the text are
      # examined like all others.
      #
      # Returns an Enumerator if no block is given.
      # Raises a RuntimeError if the text is nil.
      def each
        raise "Invalid argument - text is nil" if @input.nil?
        return enum_for(:each) unless block_given?

        buffer_length = 10
        sentence = "".dup
        before_buffer = "".dup
        after_buffer = "".dup
        @input.pos = 0
        # Pre-fill the look-ahead window; the input may be shorter than it.
        buffer_length.times do
          break if @input.eof?
          after_buffer << @input.readchar
        end

        # Each pass examines the position between before_buffer and
        # after_buffer, then moves one character across that position.
        until after_buffer.empty?
          if BEFORE_RE.match(before_buffer)
            if @debug
              puts "#{before_buffer}|#{after_buffer.gsub(/\n/,"\\n")}"
            end
            REGEXPS.each do |before_re,after_re,value|
              next unless before_re.match(before_buffer) && after_re.match(after_buffer)
              if @debug
                # Mark the decision in the output: red for a break rule,
                # green for a rule which prevents the break.
                color = value ? :red : :green
                sentence << Term::ANSIColor.send(color,"<#{before_re}:#{after_re}>")
              end
              if value
                yield sentence
                sentence = "".dup
              end
              # Only the first matching rule is taken into account.
              break
            end
          end
          before_buffer.slice!(0) if before_buffer.size >= buffer_length
          current = after_buffer.slice!(0)
          before_buffer << current
          sentence << current
          after_buffer << @input.readchar unless @input.eof?
        end
        yield sentence unless sentence.empty?
      end
    end
  end
end
@@ -0,0 +1,57 @@
1
+ # encoding: utf-8
2
+
module SRX
  module English
    # Splits a sentence into typed segments (words, numbers, punctuation,
    # other visible characters and everything else).
    class WordSplitter
      include Enumerable

      attr_accessor :sentence

      # Segment type => pattern source. The order matters: the patterns are
      # joined into one alternation, so earlier types win over later ones.
      SPLIT_RULES = {
        :word => "\\p{Alpha}\\p{Word}*",
        :number => "\\p{Digit}+(?:[:., _/-]\\p{Digit}+)*",
        :punct => "\\p{Punct}",
        :graph => "\\p{Graph}",
        :other => "[^\\p{Word}\\p{Graph}]+"
      }

      # One capture group per rule, in the order of SPLIT_RULES.
      SPLIT_RE = /#{SPLIT_RULES.values.map{|v| "(#{v})"}.join("|")}/m

      # The initializer accepts a +sentence+, which is expected to be
      # a String (it has to respond to +scan+).
      #
      # The splitter might be initialized without the sentence,
      # but it should be set using the accessor before the first call to
      # the +each+ method.
      def initialize(sentence=nil)
        @sentence = sentence
      end

      # This method iterates over the segments of the sentence.
      # It yields the string representation of the segment and
      # its type, which is one of:
      # * +:word+ - a regular word (including words containing numbers, like A4)
      # * +:number+ - a number (including number with spaces, dashes, slashes, etc.)
      # * +:punct+ - single punctuation character (comma, semicolon, full stop, etc.)
      # * +:graph+ - any single graphical (visible) character
      # * +:other+ - anything which is not covered by the above types (non-visible
      #   characters in particular)
      #
      # Returns an Enumerator if no block is given.
      # Raises a RuntimeError if the sentence is nil.
      def each
        raise "Invalid argument - sentence is nil" if @sentence.nil?
        return enum_for(:each) unless block_given?

        types = SPLIT_RULES.keys
        @sentence.scan(SPLIT_RE) do |captures|
          # Exactly one group takes part in every match; its position
          # identifies the segment type.
          index = captures.index { |capture| !capture.nil? }
          yield captures[index], types[index]
        end
      end
    end
  end
end
57
+
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+
# Gem specification of 'srx-english'. The file lists are taken from the
# git index, so the gem has to be built from within a git checkout.
Gem::Specification.new do |s|
  s.name = "srx-english"
  s.version = "0.1.0"
  s.platform = Gem::Platform::RUBY
  s.authors = ["Aleksander Pohl"]
  s.email = ["apohllo@o2.pl"]
  s.homepage = "http://github.com/apohllo/srx2ruby"
  s.summary = %q{English sentence and word segmentation rules.}
  s.description = %q{English sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Miłkowski's SRX rules.}
  # The README distributes the library under the GNU GPL, version 3 or
  # (at the user's option) any later version.
  s.licenses = ["GPL-3.0-or-later"]

  s.rubyforge_project = "srx-english"
  s.has_rdoc = false

  s.files = `git ls-files`.split("\n")
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # Only used for the coloured debug output of the sentence splitter.
  s.add_dependency("term-ansicolor", ["~> 1.0.5"])
end
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: srx-english
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.1.0
6
+ platform: ruby
7
+ authors:
8
+ - Aleksander Pohl
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2012-04-19 00:00:00 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: term-ansicolor
17
+ prerelease: false
18
+ requirement: &id001 !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - ~>
22
+ - !ruby/object:Gem::Version
23
+ version: 1.0.5
24
+ type: :runtime
25
+ version_requirements: *id001
26
+ description: "English sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Mi\xC5\x82kowski's SRX rules."
27
+ email:
28
+ - apohllo@o2.pl
29
+ executables: []
30
+
31
+ extensions: []
32
+
33
+ extra_rdoc_files: []
34
+
35
+ files:
36
+ - .gitignore
37
+ - README.rdoc
38
+ - changelog.txt
39
+ - features/sentence_splitter.feature
40
+ - features/steps/sentence_splitter.rb
41
+ - features/steps/word_splitter.rb
42
+ - features/word_splitter.feature
43
+ - lib/srx/english/sentence_splitter.rb
44
+ - lib/srx/english/word_splitter.rb
45
+ - srx-english.gemspec
46
+ homepage: http://github.com/apohllo/srx2ruby
47
+ licenses: []
48
+
49
+ post_install_message:
50
+ rdoc_options: []
51
+
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: "0"
60
+ required_rubygems_version: !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: "0"
66
+ requirements: []
67
+
68
+ rubyforge_project: srx-english
69
+ rubygems_version: 1.8.21
70
+ signing_key:
71
+ specification_version: 3
72
+ summary: English sentence and word segmentation rules.
73
+ test_files:
74
+ - features/sentence_splitter.feature
75
+ - features/steps/sentence_splitter.rb
76
+ - features/steps/word_splitter.rb
77
+ - features/word_splitter.feature