rudge 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/rudge.rb +23 -0
- data/lib/rudge/abbreviations.rb +22 -0
- metadata +45 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a8ed66b8eeec5d42fc40a8dd59567508795bd056
|
4
|
+
data.tar.gz: c4e51bd25dd55ddd1ea5821b853dfa7d8df7e6f1
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 8e62a34ee8f52c7ce5a6762d97fb973c94dd9a68f26935546ec66f4db6479b506a718c8cbe2b3b5f19536a74888229f03e7f96374ac20f7cfd8c7dbf063fa6b3
|
7
|
+
data.tar.gz: 4b7c58b69d7401a58c4e19339ac3c13866f80e46b7f9cd8dd9cd93438363da317a0a2fd2e1579a30b0ef6219f39c6457ca80ff2ca277745433678e8b99fc9637
|
data/lib/rudge.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# Splits free text into sentences using punctuation-based heuristics.
class Rudge
  require 'rudge/abbreviations'

  # end of sentence marker: a control character inserted at candidate
  # sentence boundaries and later used as the split delimiter
  EOS = "\001"

  # Determines the sentences contained in +text+.
  #
  # @param text [String] the text to split; the argument itself is not modified
  # @return [Array<String>] the sentences, each stripped of surrounding whitespace
  def self.sentences(text)
    # initial split after ., ? or ! (optionally followed by one closing
    # quote or bracket); the whitespace is kept after the marker so the
    # corrections below can match "punctuation + EOS" directly
    marked = text.gsub(/([.?!]["')\]}]?)(\s+)/) { "#{$1}#{EOS}#{$2}" }

    # correct ellipsis marks and rows of stops: two or more dots do not
    # end a sentence
    marked.gsub!(/(\.{2,})#{EOS}/) { $1 }

    # correct abbreviations: "Mr." / "etc." followed by whitespace is not a
    # sentence boundary
    marked.gsub!(abbreviation_pattern) { "#{$1}." }

    # split on EOS marker, strip gets rid of trailing whitespace
    marked.split(EOS).map(&:strip)
  end

  # Builds (once) the case-insensitive regexp matching a known abbreviation
  # immediately followed by a period and the EOS marker.
  #
  # The leading \b makes sure only whole words count as abbreviations, so
  # "last." is no longer mistaken for the abbreviation "st.". Regexp.union
  # escapes every entry; its source is interpolated (rather than the Regexp
  # object) so the outer /i flag still applies to the alternation.
  def self.abbreviation_pattern
    @abbreviation_pattern ||= begin
      alternatives = Regexp.union(Rudge::Abbreviations.list).source
      /\b(#{alternatives})\.#{EOS}/i
    end
  end
  private_class_method :abbreviation_pattern
end
|
data/lib/rudge/abbreviations.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
class Rudge
  # Lowercase abbreviations (written without their trailing period), grouped
  # by category and exposed as one combined list through .list.
  #
  # NOTE: %w literals are whitespace-separated. Writing commas between the
  # words makes each comma part of the word ("jr," instead of "jr"), so the
  # entries below deliberately contain no commas.
  class Abbreviations
    # personal, professional and military titles
    TITLES = %w[jr mr mrs ms dr prof sr sen rep rev gov atty
                supt det col gen lt cmdr adm capt sgt cpl maj].freeze

    # organisations and company suffixes
    ENTITIES = %w[dept univ uni assn bros inc ltd co corp plc].freeze

    # month names
    MONTHS = %w[jan feb mar apr may jun jul aug sep sept oct
                nov dec].freeze

    # day names
    DAYS = %w[mon tue wed thu fri sat sun].freeze

    # street and address terms
    ADDRESSES = %w[ave bld blvd cl ct cres dr rd st].freeze

    # everything else
    MISC = %w[vs etc no esp cf].freeze

    # Returns every known abbreviation as a new array, in category order.
    # Entries that appear in more than one category (e.g. "dr") are
    # returned only once.
    #
    # @return [Array<String>] the combined abbreviation list
    def self.list
      (TITLES + ENTITIES + MONTHS + DAYS + ADDRESSES + MISC).uniq
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rudge
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- bruno coelho
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-01-31 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A Ruby Judge! Determines the sentences in a text.
|
14
|
+
email: bruno.coelho@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/rudge.rb
|
20
|
+
- lib/rudge/abbreviations.rb
|
21
|
+
homepage: http://rubygems.org/gems/rudge
|
22
|
+
licenses:
|
23
|
+
- MIT
|
24
|
+
metadata: {}
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - '>='
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
requirements: []
|
40
|
+
rubyforge_project:
|
41
|
+
rubygems_version: 2.2.1
|
42
|
+
signing_key:
|
43
|
+
specification_version: 4
|
44
|
+
summary: Rudge - determines sentences.
|
45
|
+
test_files: []
|