pedantic 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Pat Allan
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.textile ADDED
@@ -0,0 +1,29 @@
1
+ h1. Pedantic
2
+
3
+ Pedantic cleans strings of text - stripping out unimportant words and URLs, fixing typos, replacing symbols (like emoticons) with real words, and running the results through a stemmer.
4
+
5
+ In short - it gives you reliable text to process (but not read).
6
+
7
+ And if the name didn't give it away, yes this library is opinionated.
8
+
9
+ h2. Installation
10
+
11
+ Grab the gem.
12
+
13
+ <pre><code>gem install pedantic</code></pre>
14
+
15
+ h2. Usage
16
+
17
+ <pre><code>Pedantic.fix('my messy string ;)') #=> 'messi string joke'</code></pre>
18
+
19
+ Note that the stemmer generates imperfect words, but its output is reasonably reliable and consistent, so you can safely build on that assumption.
20
+
21
+ Also - this library is a work in progress - currently I've aimed for a relatively useful but extremely basic implementation. If you look through the code, you'll see that only a few typos and emoticons are handled. It's easy enough to extend, though - so please fork, patch and send a pull request.
22
+
23
+ h2. Contributing
24
+
25
+ Fork and patch as you see fit - and please send me a pull request if you think it's useful for others. Don't forget to write specs first, and don't mess with the version numbers please (or at least: only do so in a different branch).
26
+
27
+ h2. Copyright
28
+
29
+ Copyright (c) 2010 "Pat Allan":http://freelancing-gods.com, but released under an open licence. Go for your life.
@@ -0,0 +1,20 @@
1
# Declared by reopening Pedantic so the constant resolves even if this file
# is loaded before lib/pedantic.rb.
class Pedantic
  # Replaces a small set of text emoticons with plain words.
  module Emoticons
    # Ordered [pattern, replacement] pairs.
    #
    # Each emoticon must be preceded by whitespace or a line start. The
    # trailing whitespace / line end is checked with a lookahead instead of
    # being consumed, so that it can still serve as the leading whitespace of
    # the next emoticon ("foo :) :)" replaces both).
    EMOTICONS = [
      [/(^|\s):\)(?=\s|$)/,    ' smile '],
      [/(^|\s):\((?=\s|$)/,    ' sad '],
      [/(^|\s):D(?=\s|$)/,     ' happy '],
      [/(^|\s):[Ss](?=\s|$)/,  ' unsure '],
      [/(^|\s);\)(?=\s|$)/,    ' joke ']
    ].freeze

    # Registers this module's processor with the including class.
    def self.included(base)
      base.processors :replace_emoticons
    end

    # Returns a copy of +string+ with known emoticons replaced by words.
    #
    # The argument is not modified, so frozen strings are accepted. The
    # replacements are padded with spaces, so the result may contain runs of
    # whitespace.
    #
    # @param string [String] text to clean
    # @return [String] a new string with emoticons replaced
    def replace_emoticons(string)
      EMOTICONS.inject(string) do |text, (pattern, replacement)|
        text.gsub(pattern, replacement)
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
# Declared by reopening Pedantic so the constant resolves even if this file
# is loaded before lib/pedantic.rb.
class Pedantic
  # Collapses letters that were repeated for emphasis ("sooo", "reeeally").
  module Emphasis
    # Ordered [pattern, replacement] pairs. All patterns are case-insensitive
    # and every replacement is lower case.
    EMPHASIS_FIXES = [
      [/\bso+\b/i,         'so'],
      [/\bre+a+ll+y\b/i,   'really'],
      [/\boka+y\b/i,       'okay'],
      # No trailing \b: only the stretched "oo...h" prefix is rewritten and
      # any following characters are left as they are.
      [/\boo+h/i,          'ooh']
    ].freeze

    # Registers this module's processor with the including class.
    def self.included(base)
      base.processors :fix_emphasis
    end

    # Returns a copy of +string+ with stretched words normalised.
    #
    # The argument is not modified, so frozen strings are accepted.
    #
    # @param string [String] text to clean
    # @return [String] a new string with emphasis removed
    def fix_emphasis(string)
      EMPHASIS_FIXES.inject(string) do |text, (pattern, replacement)|
        text.gsub(pattern, replacement)
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
# Declared by reopening Pedantic so the constant resolves even if this file
# is loaded before lib/pedantic.rb.
class Pedantic
  # Strips HTML entity residue from text.
  module Html
    # Entities that are deleted outright. Only the double-quote entity is
    # handled at present.
    HTML_ENTITIES = [
      /&quot;/
    ].freeze

    # Registers this module's processor with the including class.
    def self.included(base)
      base.processors :remove_html
    end

    # Returns a copy of +string+ with the known entities removed.
    #
    # The argument is not modified, so frozen strings are accepted.
    #
    # @param string [String] text to clean
    # @return [String] a new string without the entities
    def remove_html(string)
      HTML_ENTITIES.inject(string) { |text, pattern| text.gsub(pattern, '') }
    end
  end
end
@@ -0,0 +1,18 @@
1
# Declared by reopening Pedantic so the constant resolves even if this file
# is loaded before lib/pedantic.rb.
class Pedantic
  # Normalises apostrophes and asterisk-censored words.
  module Punctuation
    # [pattern, replacement] pairs, applied strictly in this order.
    #
    # The "can't" rule has to run before the bare-apostrophe rule, otherwise
    # "can't" would already have become "cant". An Array keeps that order
    # explicit instead of relying on Hash iteration order.
    PUNCTUATION_FIXES = [
      [/\bcan't\b/i,  'cannot'],
      [/'/,           ''],
      [/sh\*t/,       'shit'],
      [/f\*[\*c]k/,   'fuck']
    ].freeze

    # Registers this module's processor with the including class.
    def self.included(base)
      base.processors :replace_punctuation
    end

    # Returns a copy of +string+ with the punctuation rules applied.
    #
    # The argument is not modified, so frozen strings are accepted.
    #
    # @param string [String] text to clean
    # @return [String] a new string with the replacements made
    def replace_punctuation(string)
      PUNCTUATION_FIXES.inject(string) do |text, (pattern, replacement)|
        text.gsub(pattern, replacement)
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'stemmer'
2
+
3
# Declared by reopening Pedantic so the constant resolves even if this file
# is loaded before lib/pedantic.rb.
class Pedantic
  # Reduces every word of the text to its stem.
  module Stems
    # Registers this module's processor with the including class.
    def self.included(base)
      base.processors :reduce_to_stems
    end

    # Returns the stems of all words in +string+, joined by single spaces.
    #
    # Words are runs of \w characters; everything else is a separator and is
    # dropped. Scanning for words (rather than splitting on separators) means
    # leading punctuation or whitespace does not produce an empty token, so
    # the result never starts with a stray space.
    #
    # Relies on String#stem, which this file expects the 'stemmer' library
    # required at the top of the file to provide.
    #
    # @param string [String] text to stem
    # @return [String] space-separated stems
    def reduce_to_stems(string)
      string.scan(/\w+/).map { |word| word.stem }.join(' ')
    end
  end
end
@@ -0,0 +1,16 @@
1
# Declared by reopening Pedantic so the constant resolves even if this file
# is loaded before lib/pedantic.rb.
class Pedantic
  # Expands text-message abbreviations into real words.
  module Txt
    # Ordered [pattern, replacement] pairs, matched case-insensitively on
    # whole words. Covers tx, tnx, thx, thnx and thnks.
    TXT_WORDS = [
      [/\bt[hn]?x\b/i,          'thanks'],
      [/\b(?:thnx|thnks)\b/i,   'thanks']
    ].freeze

    # Registers this module's processor with the including class.
    def self.included(base)
      base.processors :replace_txt_words
    end

    # Returns a copy of +string+ with known abbreviations expanded.
    #
    # The argument is not modified, so frozen strings are accepted.
    #
    # @param string [String] text to clean
    # @return [String] a new string with abbreviations expanded
    def replace_txt_words(string)
      TXT_WORDS.inject(string) do |text, (pattern, replacement)|
        text.gsub(pattern, replacement)
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
# Declared by reopening Pedantic so the constant resolves even if this file
# is loaded before lib/pedantic.rb.
class Pedantic
  # Removes filler words that carry little meaning.
  module Unimportant
    # Words to delete. Contractions appear without apostrophes ("didnt",
    # "isnt"). "doesnt" and "dont" are separate entries: fused into one token
    # they could match neither word.
    UNIMPORTANT_WORDS = %w(
      a again all along also an and are as at but by came can cant couldnt did
      didn didnt do doesnt dont ever first from have her here him how i if in
      into is isnt it itll just last least like most my new no not now of on or
      should sinc so some th than this that the their then those to told too
      true try until url us were when whether while with within yes you youll
    ).freeze

    # One alternation built once, so the text is scanned a single time
    # instead of once per word. Matching is whole-word and case-sensitive.
    UNIMPORTANT_PATTERN = /\b(?:#{UNIMPORTANT_WORDS.join('|')})\b/

    # Registers this module's processor with the including class.
    def self.included(base)
      base.processors :remove_unimportant_words
    end

    # Returns a copy of +string+ with every unimportant word deleted.
    #
    # Only the words are removed; the whitespace around them stays, so the
    # result may contain runs of spaces. The argument is not modified, so
    # frozen strings are accepted.
    #
    # @param string [String] text to clean
    # @return [String] a new string without the unimportant words
    def remove_unimportant_words(string)
      string.gsub(UNIMPORTANT_PATTERN, '')
    end
  end
end
@@ -0,0 +1,16 @@
1
# Declared by reopening Pedantic so the constant resolves even if this file
# is loaded before lib/pedantic.rb.
class Pedantic
  # Removes web addresses from text.
  module Uris
    # Patterns deleted from the text, in order:
    # 1. explicit http:// or https:// addresses, up to the next whitespace;
    # 2. bare host names ending in one of a few known TLDs, optionally
    #    followed by one more dot-segment (e.g. "shop.co.uk").
    #
    # The trailing \b requires the TLD (or the extra segment) to end at a
    # word boundary, so ordinary dotted text such as "node.community" is not
    # cut off after ".com".
    URI_PATTERNS = [
      %r{https?://\S+},
      /[\w.]+\.(?:com|co|net|org|info)(?:\.\w+)?\b/
    ].freeze

    # Registers this module's processor with the including class.
    def self.included(base)
      base.processors :remove_uris
    end

    # Returns a copy of +string+ with web addresses deleted.
    #
    # The argument is not modified, so frozen strings are accepted.
    #
    # @param string [String] text to clean
    # @return [String] a new string without the addresses
    def remove_uris(string)
      URI_PATTERNS.inject(string) { |text, pattern| text.gsub(pattern, '') }
    end
  end
end
data/lib/pedantic.rb ADDED
@@ -0,0 +1,51 @@
1
# Cleans a string by passing it through a list of registered processors and
# then normalising whitespace.
#
# Processors are instance-method names. Mixins register theirs by calling
# Pedantic.processors from their +included+ hook, so processors run in the
# order in which they were registered.
class Pedantic
  @@processors = []

  # Cleans +string+ in one call.
  #
  # @param string [String] text to clean
  # @return [String] the cleaned text
  def self.fix(string)
    new(string).to_s
  end

  # Appends processor method names to the pipeline.
  #
  # @param methods [Array<Symbol>] names of instance methods that take a
  #   String and return a String
  # @return [Array<Symbol>] the full processor list after the addition
  def self.processors(*methods)
    @@processors += methods
  end

  # @param string [String] the raw text; it is never modified
  def initialize(string)
    @original = string
  end

  # @return [String] the cleaned text (recomputed on every call)
  def to_s
    process
  end

  private

  # Runs every processor over a private copy of the original text, then
  # collapses each whitespace run to one space and strips both ends.
  def process
    # dup rather than clone: clone would carry over a frozen state, and a
    # processor that edits the string in place would then raise.
    @current = @original.dup

    @@processors.each do |processor|
      @current = send(processor, @current)
    end

    @current.gsub(/\s+/, ' ').strip
  end
end
32
+
33
+ require 'pedantic/emphasis'
34
+ require 'pedantic/emoticons'
35
+ require 'pedantic/html'
36
+ require 'pedantic/punctuation'
37
+ require 'pedantic/stems'
38
+ require 'pedantic/txt'
39
+ require 'pedantic/unimportant'
40
+ require 'pedantic/uris'
41
+
42
# Mix every processor module into Pedantic. Each module registers its
# processor when it is included, so the order of this list is the order in
# which the processors run.
class Pedantic
  [
    Pedantic::Uris,
    Pedantic::Html,
    Pedantic::Emoticons,
    Pedantic::Txt,
    Pedantic::Punctuation,
    Pedantic::Emphasis,
    Pedantic::Unimportant,
    Pedantic::Stems
  ].each { |processor_module| include processor_module }
end
@@ -0,0 +1,23 @@
1
+ require 'spec/spec_helper'
2
+
3
describe Pedantic::Emoticons do
  # Examples go through Pedantic.fix, i.e. the whole pipeline, which is why
  # the expected words are stems ('happi', 'unsur').
  def cleaned(text)
    Pedantic.fix(text)
  end

  it("replaces :) with smile") { cleaned('foo :)').should == 'foo smile' }

  it("replaces :( with sad") { cleaned('foo :(').should == 'foo sad' }

  it("replaces :D with happy") { cleaned('foo :D').should == 'foo happi' }

  it("replaces :S and :s with unsure") { cleaned('foo :S :s').should == 'foo unsur unsur' }

  it("replaces ;) with joke") { cleaned('foo ;)').should == 'foo joke' }
end
@@ -0,0 +1,15 @@
1
+ require 'spec/spec_helper'
2
+
3
# Expectations are the stemmed forms ('realli', 'okai') because the examples
# go through Pedantic.fix, i.e. the whole pipeline.
describe Pedantic::Emphasis do
  it "replaces reeeaalllly with really" do
    Pedantic.fix('Reaaally reeeaalllly').should == 'realli realli'
  end

  it "replaces okaaaay with okay" do
    Pedantic.fix('Okaaaaay okaaay').should == 'okai okai'
  end

  it "replaces oooooh with ooh" do
    Pedantic.fix('OoOoOoh').should == 'ooh'
  end
end
@@ -0,0 +1,7 @@
1
+ require 'spec/spec_helper'
2
+
3
describe Pedantic::Html do
  # Goes through the whole pipeline, hence the stemmed 'quot'.
  def cleaned(text)
    Pedantic.fix(text)
  end

  it("removes HTML quotes") { cleaned('&quot;air quotes&quot;').should == 'air quot' }
end
@@ -0,0 +1,19 @@
1
+ require 'spec/spec_helper'
2
+
3
describe Pedantic::Punctuation do
  # Examples run the whole pipeline via Pedantic.fix.
  def cleaned(text)
    Pedantic.fix(text)
  end

  it("replaces can't with cannot") { cleaned("can't foo").should == 'cannot foo' }

  it("removes other apostrophes") { cleaned("isn't it foo").should == 'foo' }

  it("replaces sh*t with shit") { cleaned('sh*t').should == 'shit' }

  it("replaces f**k and f*ck with fuck") { cleaned('f**k f*ck').should == 'fuck fuck' }
end
@@ -0,0 +1,11 @@
1
+ require 'spec/spec_helper'
2
+
3
describe Pedantic::Stems do
  # Examples run the whole pipeline via Pedantic.fix.
  def cleaned(text)
    Pedantic.fix(text)
  end

  it("reduces words to their core stems") { cleaned('wandering').should == 'wander' }

  it("reduces all words to their core stems") { cleaned('wandering and running').should == 'wander run' }
end
@@ -0,0 +1,15 @@
1
+ require 'spec/spec_helper'
2
+
3
describe Pedantic::Txt do
  # Goes through the whole pipeline, hence the stemmed 'thank'.
  def cleaned(text)
    Pedantic.fix(text)
  end

  it("replaces tx with thank") { cleaned('tx').should == 'thank' }

  it("replaces tnx with thank") { cleaned('tnx').should == 'thank' }

  it("replaces thnks with thank") { cleaned('thnks').should == 'thank' }
end
@@ -0,0 +1,7 @@
1
+ require 'spec/spec_helper'
2
+
3
describe Pedantic::Unimportant do
  # Examples run the whole pipeline via Pedantic.fix.
  def cleaned(text)
    Pedantic.fix(text)
  end

  it("removes words like 'the', 'a', 'and'") { cleaned('a foo and the bar').should == 'foo bar' }
end
@@ -0,0 +1,15 @@
1
+ require 'spec/spec_helper'
2
+
3
describe Pedantic::Uris do
  # Examples run the whole pipeline via Pedantic.fix.
  def cleaned(text)
    Pedantic.fix(text)
  end

  it("removes explicit web addresses") { cleaned('foo http://bar.com bar').should == 'foo bar' }

  it("removes explicit https addresses") { cleaned('foo https://bar.com bar').should == 'foo bar' }

  it("removes implicit web addresses") { cleaned('foo domain.com bar').should == 'foo bar' }
end
@@ -0,0 +1,17 @@
1
+ require 'spec/spec_helper'
2
+
3
describe Pedantic do
  describe '.fix' do
    it "removes leading and trailing spaces" do
      Pedantic.fix(' foo ').should == 'foo'
    end

    it "replaces multiple spaces with a single space" do
      # The input must actually contain a run of spaces for this example to
      # exercise the whitespace collapsing.
      Pedantic.fix('foo   bar').should == 'foo bar'
    end

    it "replaces new lines with spaces" do
      Pedantic.fix("foo\nbar").should == 'foo bar'
    end
  end
end
@@ -0,0 +1,10 @@
1
# Put the spec directory and then the gem's lib directory at the front of the
# load path, so lib ends up first.
spec_root = File.dirname(__FILE__)
[spec_root, File.join(spec_root, '..', 'lib')].each do |path|
  $LOAD_PATH.unshift(path)
end

require 'pedantic'
require 'spec'
require 'spec/autorun'

# No custom runner configuration yet.
Spec::Runner.configure do |config|
  #
end
metadata ADDED
@@ -0,0 +1,104 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pedantic
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Pat Allan
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-02-08 00:00:00 +08:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: stemmer
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.0.1
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: rspec
27
+ type: :development
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 1.2.9
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: yard
37
+ type: :development
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ description: Pares text down to the words that matter
46
+ email: pat@freelancing-gods.com
47
+ executables: []
48
+
49
+ extensions: []
50
+
51
+ extra_rdoc_files:
52
+ - LICENSE
53
+ - README.textile
54
+ files:
55
+ - LICENSE
56
+ - README.textile
57
+ - lib/pedantic.rb
58
+ - lib/pedantic/emoticons.rb
59
+ - lib/pedantic/emphasis.rb
60
+ - lib/pedantic/html.rb
61
+ - lib/pedantic/punctuation.rb
62
+ - lib/pedantic/stems.rb
63
+ - lib/pedantic/txt.rb
64
+ - lib/pedantic/unimportant.rb
65
+ - lib/pedantic/uris.rb
66
+ has_rdoc: true
67
+ homepage: http://github.com/freelancing-god/pedantic
68
+ licenses: []
69
+
70
+ post_install_message:
71
+ rdoc_options:
72
+ - --charset=UTF-8
73
+ require_paths:
74
+ - lib
75
+ required_ruby_version: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: "0"
80
+ version:
81
+ required_rubygems_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: "0"
86
+ version:
87
+ requirements: []
88
+
89
+ rubyforge_project:
90
+ rubygems_version: 1.3.5
91
+ signing_key:
92
+ specification_version: 3
93
+ summary: Text Cleaner
94
+ test_files:
95
+ - spec/pedantic/emoticons_spec.rb
96
+ - spec/pedantic/emphasis_spec.rb
97
+ - spec/pedantic/html_spec.rb
98
+ - spec/pedantic/punctuation_spec.rb
99
+ - spec/pedantic/stems_spec.rb
100
+ - spec/pedantic/txt_spec.rb
101
+ - spec/pedantic/unimportant_spec.rb
102
+ - spec/pedantic/uris_spec.rb
103
+ - spec/pedantic_spec.rb
104
+ - spec/spec_helper.rb