pedantic 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +20 -0
- data/README.textile +29 -0
- data/lib/pedantic/emoticons.rb +20 -0
- data/lib/pedantic/emphasis.rb +18 -0
- data/lib/pedantic/html.rb +15 -0
- data/lib/pedantic/punctuation.rb +18 -0
- data/lib/pedantic/stems.rb +13 -0
- data/lib/pedantic/txt.rb +16 -0
- data/lib/pedantic/unimportant.rb +19 -0
- data/lib/pedantic/uris.rb +16 -0
- data/lib/pedantic.rb +51 -0
- data/spec/pedantic/emoticons_spec.rb +23 -0
- data/spec/pedantic/emphasis_spec.rb +15 -0
- data/spec/pedantic/html_spec.rb +7 -0
- data/spec/pedantic/punctuation_spec.rb +19 -0
- data/spec/pedantic/stems_spec.rb +11 -0
- data/spec/pedantic/txt_spec.rb +15 -0
- data/spec/pedantic/unimportant_spec.rb +7 -0
- data/spec/pedantic/uris_spec.rb +15 -0
- data/spec/pedantic_spec.rb +17 -0
- data/spec/spec_helper.rb +10 -0
- metadata +104 -0
data/LICENSE
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Copyright (c) 2009 Pat Allan
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
4
|
+
a copy of this software and associated documentation files (the
|
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
9
|
+
the following conditions:
|
|
10
|
+
|
|
11
|
+
The above copyright notice and this permission notice shall be
|
|
12
|
+
included in all copies or substantial portions of the Software.
|
|
13
|
+
|
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.textile
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
h1. Pedantic
|
|
2
|
+
|
|
3
|
+
Pedantic cleans strings of text - stripping out unimportant words and URLs, fixing typos, replacing symbols (like emoticons) with real words, and running the results through a stemmer.
|
|
4
|
+
|
|
5
|
+
In short - it gives you reliable text to process (but not read).
|
|
6
|
+
|
|
7
|
+
And if the name didn't give it away, yes this library is opinionated.
|
|
8
|
+
|
|
9
|
+
h2. Installation
|
|
10
|
+
|
|
11
|
+
Grab the gem.
|
|
12
|
+
|
|
13
|
+
<pre><code>gem install pedantic</code></pre>
|
|
14
|
+
|
|
15
|
+
h2. Usage
|
|
16
|
+
|
|
17
|
+
<pre><code>Pedantic.fix('my messy string ;)') #=> 'messi string joke'</code></pre>
|
|
18
|
+
|
|
19
|
+
Note that the stemmer generates imperfect words, but its output is reasonably reliable and consistent, so you can build on that assumption when processing the results.
|
|
20
|
+
|
|
21
|
+
Also - this library is a work in progress - currently I've aimed for a relatively useful but extremely basic implementation. If you look through the code, you'll see that only a few typos and emoticons are handled. It's easy enough to extend, though - so please fork, patch and send a pull request.
|
|
22
|
+
|
|
23
|
+
h2. Contributing
|
|
24
|
+
|
|
25
|
+
Fork and patch as you see fit - and please send me a pull request if you think it's useful for others. Don't forget to write specs first, and don't mess with the version numbers please (or at least: only do so in a different branch).
|
|
26
|
+
|
|
27
|
+
h2. Copyright
|
|
28
|
+
|
|
29
|
+
Copyright (c) 2010 "Pat Allan":http://freelancing-gods.com, but released under an open licence. Go for your life.
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
class Pedantic
  # Processor mixin that swaps stand-alone text emoticons for plain words.
  module Emoticons
    # Ordered [pattern, replacement] pairs. An emoticon only counts when it is
    # delimited by whitespace or a line edge on both sides; the delimiters are
    # consumed by the match and re-emitted as the spaces around the word.
    EMOTICONS = [
      [/(^|\s)\:\)(\s|$)/, ' smile '],
      [/(^|\s)\:\((\s|$)/, ' sad '],
      [/(^|\s)\:D(\s|$)/,  ' happy '],
      [/(^|\s)\:S(\s|$)/,  ' unsure '],
      [/(^|\s)\:s(\s|$)/,  ' unsure '],
      [/(^|\s)\;\)(\s|$)/, ' joke ']
    ].freeze

    # Registers +replace_emoticons+ as a processor on the including class.
    #
    # @param base [Class] class that responds to +processors+
    def self.included(base)
      base.processors :replace_emoticons
    end

    # Replaces every known emoticon in +string+ with its word, in place.
    #
    # @param string [String] mutable text to clean (modified destructively)
    # @return [String] the same string object
    def replace_emoticons(string)
      EMOTICONS.each do |pattern, replacement|
        # A match swallows its trailing delimiter, so the second of two
        # neighbouring emoticons (":) :)") has no leading delimiter left in the
        # same pass. gsub! returns nil once nothing matched, so repeat until
        # then; replacements never contain an emoticon, so this terminates.
        loop { break unless string.gsub!(pattern, replacement) }
      end

      string
    end
  end
end
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
class Pedantic
  # Processor mixin that collapses letters repeated for emphasis
  # ("reeeaalllly" becomes "really").
  module Emphasis
    # Ordered [pattern, replacement] pairs. Every rule is case-insensitive and
    # the replacement is always lower case.
    EMPHASIS_FIXES = [
      [/\bso+\b/i,         'so'],
      [/\bre+a+ll+y\b/i,   'really'],
      [/\boka+y\b/i,       'okay'],
      [/\boo+h/i,          'ooh']
    ].freeze

    # Registers +fix_emphasis+ as a processor on the including class.
    #
    # @param base [Class] class that responds to +processors+
    def self.included(base)
      base.processors :fix_emphasis
    end

    # Normalises emphasised spellings in +string+, in place.
    #
    # @param string [String] mutable text to clean (modified destructively)
    # @return [String] the same string object
    def fix_emphasis(string)
      EMPHASIS_FIXES.each do |pattern, replacement|
        string.gsub!(pattern, replacement)
      end

      string
    end
  end
end
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
class Pedantic
  # Processor mixin that expands "can't", strips apostrophes and restores
  # asterisk-censored swear words.
  module Punctuation
    # Ordered [pattern, replacement] pairs. Order matters: "can't" must be
    # expanded before the blanket apostrophe removal turns it into "cant", so
    # the rules live in an Array rather than relying on Hash iteration order.
    PUNCTUATION_FIXES = [
      [/\bcan't\b/i, 'cannot'],
      [/'/,          ''],
      [/sh\*t/,      'shit'],
      [/f\*[\*c]k/,  'fuck']
    ].freeze

    # Registers +replace_punctuation+ as a processor on the including class.
    #
    # @param base [Class] class that responds to +processors+
    def self.included(base)
      base.processors :replace_punctuation
    end

    # Applies the punctuation rules to +string+, in place.
    #
    # @param string [String] mutable text to clean (modified destructively)
    # @return [String] the same string object
    def replace_punctuation(string)
      PUNCTUATION_FIXES.each do |pattern, replacement|
        string.gsub!(pattern, replacement)
      end

      string
    end
  end
end
|
data/lib/pedantic/txt.rb
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
class Pedantic
  # Processor mixin that expands text-speak abbreviations into real words.
  module Txt
    # Ordered [pattern, replacement] pairs, all case-insensitive.
    # The first rule covers "tx", "tnx" and "thx".
    TXT_WORDS = [
      [/\bt[hn]?x\b/i, 'thanks'],
      [/\bthnks\b/i,   'thanks']
    ].freeze

    # Registers +replace_txt_words+ as a processor on the including class.
    #
    # @param base [Class] class that responds to +processors+
    def self.included(base)
      base.processors :replace_txt_words
    end

    # Expands known abbreviations in +string+, in place.
    #
    # @param string [String] mutable text to clean (modified destructively)
    # @return [String] the same string object
    def replace_txt_words(string)
      TXT_WORDS.each do |pattern, replacement|
        string.gsub!(pattern, replacement)
      end

      string
    end
  end
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
class Pedantic
  # Processor mixin that deletes filler words which carry little meaning.
  module Unimportant
    # Lower-case words to drop. Contractions appear without apostrophes
    # ("cant", "didnt"). "doesnt" and "dont" are separate entries: fused into
    # one token, neither word was ever removed.
    # NOTE(review): "sinc" and "th" look like truncated or stemmed forms - confirm.
    UNIMPORTANT_WORDS = %w(
      a again all along also an and are as at but by came can cant couldnt did
      didn didnt do doesnt dont ever first from have her here him how i if in
      into is isnt it itll just last least like most my new no not now of on or
      should sinc so some th than this that the their then those to told too
      true try until url us were when whether while with within yes you youll
    ).freeze

    # One precompiled, case-sensitive whole-word alternation over the list.
    # Equivalent to removing the words one at a time: a removed word is always
    # bounded by non-word characters, so deleting it cannot create a new match.
    UNIMPORTANT_PATTERN = /\b(?:#{UNIMPORTANT_WORDS.join('|')})\b/

    # Registers +remove_unimportant_words+ as a processor on the including class.
    #
    # @param base [Class] class that responds to +processors+
    def self.included(base)
      base.processors :remove_unimportant_words
    end

    # Deletes every listed word from +string+, in place. The surrounding
    # whitespace is left untouched.
    #
    # @param string [String] mutable text to clean (modified destructively)
    # @return [String] the same string object
    def remove_unimportant_words(string)
      string.gsub!(UNIMPORTANT_PATTERN, '')
      string
    end
  end
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
class Pedantic
  # Processor mixin that strips web addresses out of the text.
  module Uris
    # Patterns applied in order:
    # 1. explicit http:// or https:// addresses, up to the next whitespace;
    # 2. bare host names ending in a known TLD, with an optional extra suffix
    #    such as ".uk". The \b after the TLD stops ordinary dotted words like
    #    "the.community" from being cut after ".com".
    URI_PATTERNS = [
      %r{https?://\S+},
      /[\w.]+\.(?:com|co|net|org|info)\b(?:\.\w+)?/
    ].freeze

    # Registers +remove_uris+ as a processor on the including class.
    #
    # @param base [Class] class that responds to +processors+
    def self.included(base)
      base.processors :remove_uris
    end

    # Deletes every recognised address from +string+, in place.
    #
    # @param string [String] mutable text to clean (modified destructively)
    # @return [String] the same string object
    def remove_uris(string)
      URI_PATTERNS.each { |pattern| string.gsub!(pattern, '') }
      string
    end
  end
end
|
data/lib/pedantic.rb
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# Cleans a string by running it through a chain of registered processors and
# then normalising whitespace.
class Pedantic
  # Names of the processor methods, in the order they are applied.
  # A class variable, so the list is shared with any subclasses.
  @@processors = []

  # Convenience wrapper: cleans +string+ in a single call.
  #
  # @param string [String] text to clean; it is not modified
  # @return [String] the cleaned text
  def self.fix(string)
    Pedantic.new(string).to_s
  end

  # Appends processor method names to the chain.
  #
  # @param methods [Array<Symbol>] names of instance methods that accept a
  #   String and return the processed String
  # @return [Array<Symbol>] the full processor list
  def self.processors(*methods)
    @@processors += methods
  end

  # @param string [String] the raw text to clean
  def initialize(string)
    @original = string
  end

  # @return [String] the cleaned text
  def to_s
    process
  end

  private

  # Feeds a private copy of the original text through each processor in
  # turn, then collapses whitespace runs to one space and trims the ends.
  #
  # @return [String] the cleaned text
  def process
    # dup rather than clone: clone keeps the frozen flag, and the processors
    # in this library use gsub!, which raises on a frozen string.
    cleaned = @@processors.inject(@original.dup) do |text, processor|
      send(processor, text)
    end

    cleaned.gsub(/\s+/, ' ').strip
  end
end
|
|
32
|
+
|
|
33
|
+
require 'pedantic/emphasis'
|
|
34
|
+
require 'pedantic/emoticons'
|
|
35
|
+
require 'pedantic/html'
|
|
36
|
+
require 'pedantic/punctuation'
|
|
37
|
+
require 'pedantic/stems'
|
|
38
|
+
require 'pedantic/txt'
|
|
39
|
+
require 'pedantic/unimportant'
|
|
40
|
+
require 'pedantic/uris'
|
|
41
|
+
|
|
42
|
+
class Pedantic
  # Mix in the processor modules one at a time, in this exact sequence. For
  # the modules whose source is visible, the included hook appends that
  # module's processor to the chain, so inclusion order is processing order.
  # NOTE(review): Html and Stems are defined in files not shown here; they
  # presumably register processors the same way - confirm.
  [
    Uris,
    Html,
    Emoticons,
    Txt,
    Punctuation,
    Emphasis,
    Unimportant,
    Stems
  ].each do |processor_module|
    include processor_module
  end
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
require 'spec/spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Pedantic::Emoticons do
  # [example description, input, expected output of the full pipeline].
  # Expected values are what the whole Pedantic.fix chain returns, hence the
  # stemmed forms such as 'happi' and 'unsur'.
  [
    ["replaces :) with smile",         'foo :)',    'foo smile'],
    ["replaces :( with sad",           'foo :(',    'foo sad'],
    ["replaces :D with happy",         'foo :D',    'foo happi'],
    ["replaces :S and :s with unsure", 'foo :S :s', 'foo unsur unsur'],
    ["replaces ;) with joke",          'foo ;)',    'foo joke']
  ].each do |description, input, expected|
    it description do
      Pedantic.fix(input).should == expected
    end
  end
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
require 'spec/spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Pedantic::Emphasis do
  # Descriptions name the normalised word; the asserted values are what the
  # whole Pedantic.fix chain returns, hence 'realli' and 'okai'.
  it "replaces reeeaalllly with really" do
    Pedantic.fix('Reaaally reeeaalllly').should == 'realli realli'
  end

  it "replaces okaaaay with okay" do
    Pedantic.fix('Okaaaaay okaaay').should == 'okai okai'
  end

  it "replaces oooooh with ooh" do
    Pedantic.fix('OoOoOoh').should == 'ooh'
  end
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
require 'spec/spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Pedantic::Punctuation do
  # [example description, input, expected output of the full pipeline].
  [
    ["replaces can't with cannot",         "can't foo",    'cannot foo'],
    ["removes other apostrophes",          "isn't it foo", 'foo'],
    ["replaces sh*t with shit",            'sh*t',         'shit'],
    ["replaces f**k and f*ck with fuck",   'f**k f*ck',    'fuck fuck']
  ].each do |description, input, expected|
    it description do
      Pedantic.fix(input).should == expected
    end
  end
end
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
require 'spec/spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Pedantic::Stems do
  # [example description, input, expected output of the full pipeline].
  [
    ["reduces words to their core stems",     'wandering',             'wander'],
    ["reduces all words to their core stems", 'wandering and running', 'wander run']
  ].each do |description, input, expected|
    it description do
      Pedantic.fix(input).should == expected
    end
  end
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
require 'spec/spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Pedantic::Txt do
  # Each abbreviation should come out of the full pipeline as 'thank'.
  [
    ["replaces tx with thank",    'tx'],
    ["replaces tnx with thank",   'tnx'],
    ["replaces thnks with thank", 'thnks']
  ].each do |description, input|
    it description do
      Pedantic.fix(input).should == 'thank'
    end
  end
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
require 'spec/spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Pedantic::Uris do
  # Every address form should vanish, leaving only the surrounding words.
  [
    ["removes explicit web addresses",   'foo http://bar.com bar'],
    ["removes explicit https addresses", 'foo https://bar.com bar'],
    ["removes implicit web addresses",   'foo domain.com bar']
  ].each do |description, input|
    it description do
      Pedantic.fix(input).should == 'foo bar'
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
require 'spec/spec_helper'
|
|
2
|
+
|
|
3
|
+
describe Pedantic do
  describe '.fix' do
    it "removes leading and trailing spaces" do
      Pedantic.fix(' foo ').should == 'foo'
    end

    # The input must contain a run of spaces, otherwise the example passes
    # without exercising whitespace collapsing at all.
    it "replaces multiple spaces with a single space" do
      Pedantic.fix('foo   bar').should == 'foo bar'
    end

    it "replaces new lines with spaces" do
      Pedantic.fix("foo\nbar").should == 'foo bar'
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: pedantic
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Pat Allan
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
|
|
12
|
+
date: 2010-02-08 00:00:00 +08:00
|
|
13
|
+
default_executable:
|
|
14
|
+
dependencies:
|
|
15
|
+
- !ruby/object:Gem::Dependency
|
|
16
|
+
name: stemmer
|
|
17
|
+
type: :runtime
|
|
18
|
+
version_requirement:
|
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
20
|
+
requirements:
|
|
21
|
+
- - ">="
|
|
22
|
+
- !ruby/object:Gem::Version
|
|
23
|
+
version: 1.0.1
|
|
24
|
+
version:
|
|
25
|
+
- !ruby/object:Gem::Dependency
|
|
26
|
+
name: rspec
|
|
27
|
+
type: :development
|
|
28
|
+
version_requirement:
|
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - ">="
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: 1.2.9
|
|
34
|
+
version:
|
|
35
|
+
- !ruby/object:Gem::Dependency
|
|
36
|
+
name: yard
|
|
37
|
+
type: :development
|
|
38
|
+
version_requirement:
|
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
40
|
+
requirements:
|
|
41
|
+
- - ">="
|
|
42
|
+
- !ruby/object:Gem::Version
|
|
43
|
+
version: "0"
|
|
44
|
+
version:
|
|
45
|
+
description: Pares text down to the words that matter
|
|
46
|
+
email: pat@freelancing-gods.com
|
|
47
|
+
executables: []
|
|
48
|
+
|
|
49
|
+
extensions: []
|
|
50
|
+
|
|
51
|
+
extra_rdoc_files:
|
|
52
|
+
- LICENSE
|
|
53
|
+
- README.textile
|
|
54
|
+
files:
|
|
55
|
+
- LICENSE
|
|
56
|
+
- README.textile
|
|
57
|
+
- lib/pedantic.rb
|
|
58
|
+
- lib/pedantic/emoticons.rb
|
|
59
|
+
- lib/pedantic/emphasis.rb
|
|
60
|
+
- lib/pedantic/html.rb
|
|
61
|
+
- lib/pedantic/punctuation.rb
|
|
62
|
+
- lib/pedantic/stems.rb
|
|
63
|
+
- lib/pedantic/txt.rb
|
|
64
|
+
- lib/pedantic/unimportant.rb
|
|
65
|
+
- lib/pedantic/uris.rb
|
|
66
|
+
has_rdoc: true
|
|
67
|
+
homepage: http://github.com/freelancing-god/pedantic
|
|
68
|
+
licenses: []
|
|
69
|
+
|
|
70
|
+
post_install_message:
|
|
71
|
+
rdoc_options:
|
|
72
|
+
- --charset=UTF-8
|
|
73
|
+
require_paths:
|
|
74
|
+
- lib
|
|
75
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
76
|
+
requirements:
|
|
77
|
+
- - ">="
|
|
78
|
+
- !ruby/object:Gem::Version
|
|
79
|
+
version: "0"
|
|
80
|
+
version:
|
|
81
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
82
|
+
requirements:
|
|
83
|
+
- - ">="
|
|
84
|
+
- !ruby/object:Gem::Version
|
|
85
|
+
version: "0"
|
|
86
|
+
version:
|
|
87
|
+
requirements: []
|
|
88
|
+
|
|
89
|
+
rubyforge_project:
|
|
90
|
+
rubygems_version: 1.3.5
|
|
91
|
+
signing_key:
|
|
92
|
+
specification_version: 3
|
|
93
|
+
summary: Text Cleaner
|
|
94
|
+
test_files:
|
|
95
|
+
- spec/pedantic/emoticons_spec.rb
|
|
96
|
+
- spec/pedantic/emphasis_spec.rb
|
|
97
|
+
- spec/pedantic/html_spec.rb
|
|
98
|
+
- spec/pedantic/punctuation_spec.rb
|
|
99
|
+
- spec/pedantic/stems_spec.rb
|
|
100
|
+
- spec/pedantic/txt_spec.rb
|
|
101
|
+
- spec/pedantic/unimportant_spec.rb
|
|
102
|
+
- spec/pedantic/uris_spec.rb
|
|
103
|
+
- spec/pedantic_spec.rb
|
|
104
|
+
- spec/spec_helper.rb
|