pedantic 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +20 -0
- data/README.textile +29 -0
- data/lib/pedantic/emoticons.rb +20 -0
- data/lib/pedantic/emphasis.rb +18 -0
- data/lib/pedantic/html.rb +15 -0
- data/lib/pedantic/punctuation.rb +18 -0
- data/lib/pedantic/stems.rb +13 -0
- data/lib/pedantic/txt.rb +16 -0
- data/lib/pedantic/unimportant.rb +19 -0
- data/lib/pedantic/uris.rb +16 -0
- data/lib/pedantic.rb +51 -0
- data/spec/pedantic/emoticons_spec.rb +23 -0
- data/spec/pedantic/emphasis_spec.rb +15 -0
- data/spec/pedantic/html_spec.rb +7 -0
- data/spec/pedantic/punctuation_spec.rb +19 -0
- data/spec/pedantic/stems_spec.rb +11 -0
- data/spec/pedantic/txt_spec.rb +15 -0
- data/spec/pedantic/unimportant_spec.rb +7 -0
- data/spec/pedantic/uris_spec.rb +15 -0
- data/spec/pedantic_spec.rb +17 -0
- data/spec/spec_helper.rb +10 -0
- metadata +104 -0
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Pat Allan
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.textile
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
h1. Pedantic
|
2
|
+
|
3
|
+
Pedantic cleans strings of text - stripping out unimportant words and URLs, fixing typos, replacing symbols (like emoticons) with real words, and running the results through a stemmer.
|
4
|
+
|
5
|
+
In short - it gives you reliable text to process (but not read).
|
6
|
+
|
7
|
+
And if the name didn't give it away, yes this library is opinionated.
|
8
|
+
|
9
|
+
h2. Installation
|
10
|
+
|
11
|
+
Grab the gem.
|
12
|
+
|
13
|
+
<pre><code>gem install pedantic</code></pre>
|
14
|
+
|
15
|
+
h2. Usage
|
16
|
+
|
17
|
+
<pre><code>Pedantic.fix('my messy string ;)') #=> 'messi string joke'</code></pre>
|
18
|
+
|
19
|
+
Note that the stemmer generates imperfect words, but its output is reasonably reliable and consistent, so you can build on that assumption when processing the results.
|
20
|
+
|
21
|
+
Also - this library is a work in progress - currently I've aimed for a relatively useful but extremely basic implementation. If you look through the code, you'll see that only a few typos and emoticons are handled. It's easy enough to extend, though - so please, fork, patch and send a pull request.
|
22
|
+
|
23
|
+
h2. Contributing
|
24
|
+
|
25
|
+
Fork and patch as you see fit - and please send me a pull request if you think it's useful for others. Don't forget to write specs first, and don't mess with the version numbers please (or at least: only do so in a different branch).
|
26
|
+
|
27
|
+
h2. Copyright
|
28
|
+
|
29
|
+
Copyright (c) 2010 "Pat Allan":http://freelancing-gods.com, but released under an open licence. Go for your life.
|
@@ -0,0 +1,20 @@
|
|
1
|
+
class Pedantic
  # Processor that swaps common text emoticons for plain words.
  #
  # Declared with explicit nesting (rather than `module Pedantic::Emoticons`)
  # so this file also loads when Pedantic has not been defined yet.
  module Emoticons
    # Emoticon patterns paired with the padded word that replaces them.
    #
    # An emoticon only counts when it stands alone: preceded by whitespace or
    # the start of a line, and followed by whitespace or the end of a line.
    # The trailing whitespace is checked with a lookahead so it is not
    # consumed; otherwise the separator between two adjacent emoticons
    # (":) :)") would be eaten by the first match and the second one skipped.
    EMOTICONS = [
      [/(^|\s):\)(?=\s|$)/,   ' smile '],
      [/(^|\s):\((?=\s|$)/,   ' sad '],
      [/(^|\s):D(?=\s|$)/,    ' happy '],
      [/(^|\s):[Ss](?=\s|$)/, ' unsure '],
      [/(^|\s);\)(?=\s|$)/,   ' joke ']
    ].freeze

    # Registers +replace_emoticons+ as a processor on the including class.
    def self.included(base)
      base.processors :replace_emoticons
    end

    # Replaces every known emoticon in +string+ with its word.
    #
    # The string is modified in place and the same object is returned. The
    # replacement words are padded with spaces on both sides.
    def replace_emoticons(string)
      EMOTICONS.each { |pattern, word| string.gsub!(pattern, word) }
      string
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
class Pedantic
  # Processor that collapses drawn-out spellings ("reeeaally") back to the
  # plain word.
  #
  # Declared with explicit nesting (rather than `module Pedantic::Emphasis`)
  # so this file also loads when Pedantic has not been defined yet.
  module Emphasis
    # Stretched-word patterns paired with their normal spelling. Every pattern
    # is case-insensitive and the replacement is always lower case. The "so"
    # pattern previously lacked the /i flag, so "Sooo" slipped through while
    # "Reaaally" did not.
    STRETCHED_WORDS = [
      [/\bso+\b/i,       'so'],
      [/\bre+a+ll+y\b/i, 'really'],
      [/\boka+y\b/i,     'okay'],
      # No trailing \b here: only the leading run of o's is collapsed.
      [/\boo+h/i,        'ooh']
    ].freeze

    # Registers +fix_emphasis+ as a processor on the including class.
    def self.included(base)
      base.processors :fix_emphasis
    end

    # Normalises stretched words in +string+.
    #
    # The string is modified in place and the same object is returned.
    def fix_emphasis(string)
      STRETCHED_WORDS.each { |pattern, word| string.gsub!(pattern, word) }
      string
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
class Pedantic
  # Processor that expands "can't", strips apostrophes and un-censors a couple
  # of starred-out swear words.
  #
  # Declared with explicit nesting (rather than `module Pedantic::Punctuation`)
  # so this file also loads when Pedantic has not been defined yet.
  module Punctuation
    # Ordered list of [pattern, replacement] pairs.
    #
    # Order matters: "can't" has to be expanded before the bare apostrophe
    # rule runs, otherwise it would already have become "cant". An array makes
    # that ordering explicit instead of relying on Hash iteration order.
    REPLACEMENTS = [
      [/\bcan't\b/i,  'cannot'],
      [/'/,           ''],
      [/sh\*t/,       'shit'],
      [/f\*[\*c]k/,   'fuck']
    ].freeze

    # Registers +replace_punctuation+ as a processor on the including class.
    def self.included(base)
      base.processors :replace_punctuation
    end

    # Applies every punctuation replacement to +string+, in order.
    #
    # The string is modified in place and the same object is returned.
    def replace_punctuation(string)
      REPLACEMENTS.each { |pattern, replacement| string.gsub!(pattern, replacement) }
      string
    end
  end
end
|
data/lib/pedantic/txt.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# Processor that expands text-message shorthand for "thanks".
module Pedantic::Txt
  # Registers +replace_txt_words+ as a processor on the including class.
  def self.included(base)
    base.processors :replace_txt_words
  end

  # Rewrites "tx", "tnx" and "thnks" (any letter case, whole words only) to
  # "thanks". The string is modified in place and returned.
  def replace_txt_words(string)
    [/\btn?x\b/i, /\bthnks\b/i].each do |shorthand|
      string.gsub!(shorthand, 'thanks')
    end

    string
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
class Pedantic
  # Processor that deletes filler words which carry little meaning.
  #
  # Declared with explicit nesting (rather than `module Pedantic::Unimportant`)
  # so this file also loads when Pedantic has not been defined yet.
  module Unimportant
    # Words to delete. Apostrophe-free spellings such as "isnt" are listed.
    # "doesnt" and "dont" were previously fused into the single entry
    # "doesntdont", so neither word was ever removed.
    UNIMPORTANT_WORDS = %w(
      a again all along also an and are as at but by came can cant couldnt did
      didn didnt do doesnt dont ever first from have her here him how i if in
      into is isnt it itll just last least like most my new no not now of on or
      should sinc so some th than this that the their then those to told too
      true try until url us were when whether while with within yes you youll
    ).freeze

    # One whole-word, case-sensitive alternation over the list, built once
    # instead of compiling a regexp per word on every call.
    UNIMPORTANT_PATTERN = /\b(?:#{UNIMPORTANT_WORDS.join('|')})\b/

    # Registers +remove_unimportant_words+ as a processor on the including
    # class.
    def self.included(base)
      base.processors :remove_unimportant_words
    end

    # Deletes every listed word from +string+.
    #
    # Only the word itself is removed; the whitespace around it is left
    # behind. The string is modified in place and the same object is returned.
    def remove_unimportant_words(string)
      string.gsub!(UNIMPORTANT_PATTERN, '')
      string
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
class Pedantic
  # Processor that strips web addresses out of the text.
  #
  # Declared with explicit nesting (rather than `module Pedantic::Uris`) so
  # this file also loads when Pedantic has not been defined yet.
  module Uris
    # Patterns removed from the text, applied in order.
    URI_PATTERNS = [
      # Explicit http:// or https:// address, up to the next whitespace.
      /https?:\/\/\S+/,
      # Bare domain such as "domain.com" or "example.co.uk". The \b after the
      # TLD stops ordinary text like "welcome.company" from losing
      # "welcome.com", and an optional trailing path is removed with the
      # domain instead of being left behind.
      /[\w.]+\.(?:com|co|net|org|info)(?:\.\w+)?\b(?:\/\S*)?/
    ].freeze

    # Registers +remove_uris+ as a processor on the including class.
    def self.included(base)
      base.processors :remove_uris
    end

    # Deletes every matched address from +string+.
    #
    # The string is modified in place and the same object is returned.
    def remove_uris(string)
      URI_PATTERNS.each { |pattern| string.gsub!(pattern, '') }
      string
    end
  end
end
|
data/lib/pedantic.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
# Cleans a string by passing it through a chain of registered processors and
# then normalising its whitespace.
#
#   Pedantic.fix(' some   text ')
#
# Processors are instance methods that take a string and return a string.
# They are registered by name through Pedantic.processors and run in
# registration order.
class Pedantic
  # Names of the processor methods, in the order they run.
  @@processors = []

  # Convenience wrapper: cleans +string+ and returns the result.
  def self.fix(string)
    Pedantic.new(string).to_s
  end

  # Appends the given method names to the processor chain and returns the
  # updated list.
  def self.processors(*methods)
    @@processors += methods
  end

  # Keeps a reference to the +string+ to clean; it is never modified.
  def initialize(string)
    @original = string
  end

  # Returns the cleaned text.
  def to_s
    process
  end

  private

  # Runs every processor over a copy of the original string, then collapses
  # each run of whitespace (including newlines) into a single space and trims
  # both ends.
  def process
    # dup rather than clone: clone keeps the frozen flag, so a frozen input
    # would make the in-place processors (which call gsub!) raise.
    @current = @original.dup

    @@processors.each do |processor|
      @current = send(processor, @current)
    end

    @current.gsub(/\s+/, ' ').strip
  end
end
|
32
|
+
|
33
|
+
require 'pedantic/emphasis'
|
34
|
+
require 'pedantic/emoticons'
|
35
|
+
require 'pedantic/html'
|
36
|
+
require 'pedantic/punctuation'
|
37
|
+
require 'pedantic/stems'
|
38
|
+
require 'pedantic/txt'
|
39
|
+
require 'pedantic/unimportant'
|
40
|
+
require 'pedantic/uris'
|
41
|
+
|
42
|
+
# Mix the processor modules into Pedantic, one at a time and in this order.
# The modules visible in this gem register their processor from their
# `included` hook, so the order of this list is the order in which those
# processors run.
class Pedantic
  [
    Pedantic::Uris,
    Pedantic::Html,
    Pedantic::Emoticons,
    Pedantic::Txt,
    Pedantic::Punctuation,
    Pedantic::Emphasis,
    Pedantic::Unimportant,
    Pedantic::Stems
  ].each do |processor_module|
    include processor_module
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'spec/spec_helper'
|
2
|
+
|
3
|
+
# Emoticon replacement, exercised through the full Pedantic.fix pipeline, so
# the expected words appear in their processed form (e.g. "happi", "unsur").
describe Pedantic::Emoticons do
  it("replaces :) with smile") { Pedantic.fix('foo :)').should == 'foo smile' }

  it("replaces :( with sad") { Pedantic.fix('foo :(').should == 'foo sad' }

  it("replaces :D with happy") { Pedantic.fix('foo :D').should == 'foo happi' }

  it("replaces :S and :s with unsure") { Pedantic.fix('foo :S :s').should == 'foo unsur unsur' }

  it("replaces ;) with joke") { Pedantic.fix('foo ;)').should == 'foo joke' }
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'spec/spec_helper'
|
2
|
+
|
3
|
+
# Emphasis normalisation, exercised through the full Pedantic.fix pipeline, so
# the expected words appear in their processed form (e.g. "realli", "okai").
describe Pedantic::Emphasis do
  # Description previously read "realy".
  it "replaces reeeaalllly with really" do
    Pedantic.fix('Reaaally reeeaalllly').should == 'realli realli'
  end

  it "replaces okaaaay with okay" do
    Pedantic.fix('Okaaaaay okaaay').should == 'okai okai'
  end

  it "replaces oooooh with ooh" do
    Pedantic.fix('OoOoOoh').should == 'ooh'
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'spec/spec_helper'
|
2
|
+
|
3
|
+
# Punctuation handling, exercised through the full Pedantic.fix pipeline.
describe Pedantic::Punctuation do
  it("replaces can't with cannot") { Pedantic.fix("can't foo").should == 'cannot foo' }

  # "isn't" and "it" are both dropped by the pipeline, leaving only "foo".
  it("removes other apostrophes") { Pedantic.fix("isn't it foo").should == 'foo' }

  it("replaces sh*t with shit") { Pedantic.fix('sh*t').should == 'shit' }

  it("replaces f**k and f*ck with fuck") { Pedantic.fix('f**k f*ck').should == 'fuck fuck' }
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'spec/spec_helper'
|
2
|
+
|
3
|
+
# Stemming, exercised through the full Pedantic.fix pipeline.
describe Pedantic::Stems do
  it("reduces words to their core stems") { Pedantic.fix('wandering').should == 'wander' }

  # "and" does not appear in the expected output.
  it("reduces all words to their core stems") { Pedantic.fix('wandering and running').should == 'wander run' }
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'spec/spec_helper'
|
2
|
+
|
3
|
+
# Text-speak expansion, exercised through the full Pedantic.fix pipeline, so
# "thanks" shows up as "thank" in the expected output.
describe Pedantic::Txt do
  it("replaces tx with thank") { Pedantic.fix('tx').should == 'thank' }

  it("replaces tnx with thank") { Pedantic.fix('tnx').should == 'thank' }

  it("replaces thnks with thank") { Pedantic.fix('thnks').should == 'thank' }
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'spec/spec_helper'
|
2
|
+
|
3
|
+
# URI removal, exercised through the full Pedantic.fix pipeline.
describe Pedantic::Uris do
  it("removes explicit web addresses") { Pedantic.fix('foo http://bar.com bar').should == 'foo bar' }

  it("removes explicit https addresses") { Pedantic.fix('foo https://bar.com bar').should == 'foo bar' }

  it("removes implicit web addresses") { Pedantic.fix('foo domain.com bar').should == 'foo bar' }
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'spec/spec_helper'
|
2
|
+
|
3
|
+
# Whitespace normalisation performed by Pedantic.fix.
describe Pedantic do
  describe '.fix' do
    it "removes leading and trailing spaces" do
      Pedantic.fix(' foo ').should == 'foo'
    end

    # The input must actually contain a run of spaces; with a single space the
    # example would pass even if nothing were collapsed.
    it "replaces multiple spaces with a single space" do
      Pedantic.fix('foo   bar').should == 'foo bar'
    end

    it "replaces new lines with spaces" do
      Pedantic.fix("foo\nbar").should == 'foo bar'
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: pedantic
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Pat Allan
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-02-08 00:00:00 +08:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: stemmer
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.0.1
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: rspec
|
27
|
+
type: :development
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.2.9
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: yard
|
37
|
+
type: :development
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: "0"
|
44
|
+
version:
|
45
|
+
description: Pares text down to the words that matter
|
46
|
+
email: pat@freelancing-gods.com
|
47
|
+
executables: []
|
48
|
+
|
49
|
+
extensions: []
|
50
|
+
|
51
|
+
extra_rdoc_files:
|
52
|
+
- LICENSE
|
53
|
+
- README.textile
|
54
|
+
files:
|
55
|
+
- LICENSE
|
56
|
+
- README.textile
|
57
|
+
- lib/pedantic.rb
|
58
|
+
- lib/pedantic/emoticons.rb
|
59
|
+
- lib/pedantic/emphasis.rb
|
60
|
+
- lib/pedantic/html.rb
|
61
|
+
- lib/pedantic/punctuation.rb
|
62
|
+
- lib/pedantic/stems.rb
|
63
|
+
- lib/pedantic/txt.rb
|
64
|
+
- lib/pedantic/unimportant.rb
|
65
|
+
- lib/pedantic/uris.rb
|
66
|
+
has_rdoc: true
|
67
|
+
homepage: http://github.com/freelancing-god/pedantic
|
68
|
+
licenses: []
|
69
|
+
|
70
|
+
post_install_message:
|
71
|
+
rdoc_options:
|
72
|
+
- --charset=UTF-8
|
73
|
+
require_paths:
|
74
|
+
- lib
|
75
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: "0"
|
80
|
+
version:
|
81
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: "0"
|
86
|
+
version:
|
87
|
+
requirements: []
|
88
|
+
|
89
|
+
rubyforge_project:
|
90
|
+
rubygems_version: 1.3.5
|
91
|
+
signing_key:
|
92
|
+
specification_version: 3
|
93
|
+
summary: Text Cleaner
|
94
|
+
test_files:
|
95
|
+
- spec/pedantic/emoticons_spec.rb
|
96
|
+
- spec/pedantic/emphasis_spec.rb
|
97
|
+
- spec/pedantic/html_spec.rb
|
98
|
+
- spec/pedantic/punctuation_spec.rb
|
99
|
+
- spec/pedantic/stems_spec.rb
|
100
|
+
- spec/pedantic/txt_spec.rb
|
101
|
+
- spec/pedantic/unimportant_spec.rb
|
102
|
+
- spec/pedantic/uris_spec.rb
|
103
|
+
- spec/pedantic_spec.rb
|
104
|
+
- spec/spec_helper.rb
|