rudge 0.0.7 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/rudge.rb +18 -12
  3. metadata +3 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2fce88b22b8874cc54365bcf58e2fcf216fecb8a
4
- data.tar.gz: 74b020bbcc59f8974cb3512ce0377d836401a1de
3
+ metadata.gz: 7b599e54d26036ce43a7a04b7d8d0147faa7ce3a
4
+ data.tar.gz: 0aa3264aaf613e84b5de6a16f65d27d02d9e076b
5
5
  SHA512:
6
- metadata.gz: 65bd3060fa479677a8a786e7f6795df02dfb55ecb6367079ce234b8c4a59e512dad804f66840259bd4a5a256ebc700facd406a69f43a08fc8b667d80843a3e39
7
- data.tar.gz: 38475cd6f0b53d486f4760f225887e010d73d9efc0acaa58eac6841ed195edb5b4f7b88030fe93ba72266dc24d19826b86c8847be376c33044755ea215a02c0c
6
+ metadata.gz: 7556b1e408626af47b5218052e408b9bec645e84a6534e1a9c7deb9e29a2e6d4766bbe1d13cb02b36c5f40ddcd12d269ffebcb9b8988c9c2365d7a85671fc0e2
7
+ data.tar.gz: a8f66f6b00e9f2c508c7051cf254fce4420c2993708a188fdc056aadc64d81b248ae0ec023f6806574c5cad0a818dc92b91cf542821a70174d6677454c352e22
@@ -1,23 +1,29 @@
1
1
  require "rudge/abbreviations"
2
2
 
3
3
  class Rudge
4
- # end of sentence marker
5
- EOS = "\001"
4
+ # end of sentence marker (before and after whitespace)
5
+ EOSBW = "\001"
6
+ EOSAW = "\002"
6
7
 
7
- def self.sentences(text)
8
+ def self.sentences(text, options={})
8
9
  text = text.dup
9
10
 
10
- # initial split after punctuation,
11
- # preserves trailing whitespace for the ellipsis correction
12
- text.gsub!(/([\.?!](?:\"|\'|\)|\]|\})?)(\s+)/) { $1 << EOS << $2 }
11
+ # initial sentence markers, before and after whitespace
12
+ text.gsub!(/([.?!](?:\"|\'|\)|\]|\})?)(\s+)/) { $1 << EOSBW << $2 << EOSAW }
13
13
 
14
- # correct ellipsis marks and rows of stops
15
- text.gsub!(/(\.\.\.*)#{EOS}/) { $1 }
14
+ # remove sentence markers on ellipsis
15
+ text.gsub!(/(\.\.\.*)#{EOSBW}(\s+)#{EOSAW}/) { $1 << $2 }
16
16
 
17
- # correct abbreviations - precompile regexp?
18
- text.gsub!(/\s(#{Rudge::Abbreviations.list.join("|")})\.#{EOS}/i) { " " << $1 << "." }
17
+ # remove sentence markers on abbreviations
18
+ abbreviations = Rudge::Abbreviations.list.join("|")
19
+ text.gsub!(/(\s)(#{abbreviations})\.#{EOSBW}(\s+)#{EOSAW}/i) { $1 << $2 << "." << $3 }
19
20
 
20
- # split on EOS marker, strip gets rid of trailing whitespace
21
- text.split(EOS).map { | sentence | sentence.strip }
21
+ if options[:keep_trailling_whitespace]
22
+ # split after whitespace, remove EOSBW marker
23
+ text.split(EOSAW).map { | sentence | sentence.gsub(EOSBW, "") }
24
+ else
25
+ # remove initial whitespace, split at markers
26
+ text.gsub(/\A\s+/, "").split(/#{EOSBW}\s+#{EOSAW}/)
27
+ end
22
28
  end
23
29
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rudge
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - bruno coelho
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-02-21 00:00:00.000000000 Z
11
+ date: 2014-10-09 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: A Ruby Judge! Determines the sentences in a text.
14
14
  email: bruno.coelho@gmail.com
@@ -38,7 +38,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
38
38
  version: '0'
39
39
  requirements: []
40
40
  rubyforge_project:
41
- rubygems_version: 2.0.14
41
+ rubygems_version: 2.4.1
42
42
  signing_key:
43
43
  specification_version: 4
44
44
  summary: Rudge - determines sentences.