pedantic 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Pat Allan
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.textile ADDED
@@ -0,0 +1,29 @@
1
+ h1. Pedantic
2
+
3
+ Pedantic cleans strings of text - stripping out unimportant words and URLs, fixing typos, replacing symbols (like emoticons) with real words, and running the results through a stemmer.
4
+
5
+ In short - it gives you reliable text to process (but not read).
6
+
7
+ And if the name didn't give it away, yes this library is opinionated.
8
+
9
+ h2. Installation
10
+
11
+ Grab the gem.
12
+
13
+ <pre><code>gem install pedantic</code></pre>
14
+
15
+ h2. Usage
16
+
17
+ <pre><code>Pedantic.fix('my messy string ;)') #=> 'messi string joke'</code></pre>
18
+
19
+ Note that the stemmer generates imperfect words, but its output is reasonably reliable and consistent, so you can build on that when processing the results.
20
+
21
+ Also - this library is a work in progress - currently I've aimed for a relatively useful but extremely basic implementation. If you look through the code, you'll see there are only a few typos and emoticons handled. It's easy enough to extend, though - so please, fork, patch and send a pull request.
22
+
23
+ h2. Contributing
24
+
25
+ Fork and patch as you see fit - and please send me a pull request if you think it's useful for others. Don't forget to write specs first, and don't mess with the version numbers please (or at least: only do so in a different branch).
26
+
27
+ h2. Copyright
28
+
29
+ Copyright (c) 2010 "Pat Allan":http://freelancing-gods.com, but released under an open licence. Go for your life.
@@ -0,0 +1,20 @@
1
+ module Pedantic::Emoticons
2
+ def self.included(base)
3
+ base.processors :replace_emoticons
4
+ end
5
+
6
+ def replace_emoticons(string)
7
+ {
8
+ /(^|\s)\:\)(\s|$)/ => ' smile ',
9
+ /(^|\s)\:\((\s|$)/ => ' sad ',
10
+ /(^|\s)\:D(\s|$)/ => ' happy ',
11
+ /(^|\s)\:S(\s|$)/ => ' unsure ',
12
+ /(^|\s)\:s(\s|$)/ => ' unsure ',
13
+ /(^|\s)\;\)(\s|$)/ => ' joke '
14
+ }.each do |pattern, replacement|
15
+ string.gsub!(pattern, replacement)
16
+ end
17
+
18
+ string
19
+ end
20
+ end
@@ -0,0 +1,18 @@
1
+ module Pedantic::Emphasis
2
+ def self.included(base)
3
+ base.processors :fix_emphasis
4
+ end
5
+
6
+ def fix_emphasis(string)
7
+ {
8
+ /\bso+\b/ => 'so',
9
+ /\bre+a+ll+y\b/i => 'really',
10
+ /\boka+y\b/i => 'okay',
11
+ /\boo+h/i => 'ooh'
12
+ }.each { |pattern, replacement|
13
+ string.gsub!(pattern, replacement)
14
+ }
15
+
16
+ string
17
+ end
18
+ end
@@ -0,0 +1,15 @@
1
+ module Pedantic::Html
2
+ def self.included(base)
3
+ base.processors :remove_html
4
+ end
5
+
6
+ def remove_html(string)
7
+ [
8
+ /&quot;/
9
+ ].each { |pattern|
10
+ string.gsub!(pattern, '')
11
+ }
12
+
13
+ string
14
+ end
15
+ end
@@ -0,0 +1,18 @@
1
+ module Pedantic::Punctuation
2
+ def self.included(base)
3
+ base.processors :replace_punctuation
4
+ end
5
+
6
+ def replace_punctuation(string)
7
+ {
8
+ /\bcan't\b/ => 'cannot',
9
+ /'/ => '',
10
+ /sh\*t/ => 'shit',
11
+ /f\*[\*c]k/ => 'fuck'
12
+ }.each { |pattern, replacement|
13
+ string.gsub!(pattern, replacement)
14
+ }
15
+
16
+ string
17
+ end
18
+ end
@@ -0,0 +1,13 @@
1
+ require 'stemmer'
2
+
3
+ module Pedantic::Stems
4
+ def self.included(base)
5
+ base.processors :reduce_to_stems
6
+ end
7
+
8
+ def reduce_to_stems(string)
9
+ string.split(/\W+/).collect { |word|
10
+ word.stem
11
+ }.join(' ')
12
+ end
13
+ end
@@ -0,0 +1,16 @@
1
+ module Pedantic::Txt
2
+ def self.included(base)
3
+ base.processors :replace_txt_words
4
+ end
5
+
6
+ def replace_txt_words(string)
7
+ {
8
+ /\btn?x\b/i => 'thanks',
9
+ /\bthnks\b/i => 'thanks'
10
+ }.each { |pattern, replacement|
11
+ string.gsub!(pattern, replacement)
12
+ }
13
+
14
+ string
15
+ end
16
+ end
@@ -0,0 +1,19 @@
1
+ module Pedantic::Unimportant
2
+ def self.included(base)
3
+ base.processors :remove_unimportant_words
4
+ end
5
+
6
+ def remove_unimportant_words(string)
7
+ %w(
8
+ a again all along also an and are as at but by came can cant couldnt did
9
+ didn didnt do doesntdont ever first from have her here him how i if in
10
+ into is isnt it itll just last least like most my new no not now of on or
11
+ should sinc so some th than this that the their then those to told too
12
+ true try until url us were when whether while with within yes you youll
13
+ ).each { |word|
14
+ string.gsub!(/\b#{word}\b/, '')
15
+ }
16
+
17
+ string
18
+ end
19
+ end
@@ -0,0 +1,16 @@
1
+ module Pedantic::Uris
2
+ def self.included(base)
3
+ base.processors :remove_uris
4
+ end
5
+
6
+ def remove_uris(string)
7
+ [
8
+ /https?:\/\/\S+/,
9
+ /([\w\.])+\.(com|co|net|org|info)(\.\w+)?/
10
+ ].each { |pattern|
11
+ string.gsub!(pattern, '')
12
+ }
13
+
14
+ string
15
+ end
16
+ end
data/lib/pedantic.rb ADDED
@@ -0,0 +1,51 @@
1
+ class Pedantic
2
+ @@processors = []
3
+
4
+ def self.fix(string)
5
+ Pedantic.new(string).to_s
6
+ end
7
+
8
+ def self.processors(*methods)
9
+ @@processors += methods
10
+ end
11
+
12
+ def initialize(string)
13
+ @original = string
14
+ end
15
+
16
+ def to_s
17
+ process
18
+ end
19
+
20
+ private
21
+
22
+ def process
23
+ @current = @original.clone
24
+
25
+ @@processors.each do |processor|
26
+ @current = send(processor, @current)
27
+ end
28
+
29
+ @current.gsub(/\s+/, ' ').strip
30
+ end
31
+ end
32
+
33
+ require 'pedantic/emphasis'
34
+ require 'pedantic/emoticons'
35
+ require 'pedantic/html'
36
+ require 'pedantic/punctuation'
37
+ require 'pedantic/stems'
38
+ require 'pedantic/txt'
39
+ require 'pedantic/unimportant'
40
+ require 'pedantic/uris'
41
+
42
+ class Pedantic
43
+ include Pedantic::Uris
44
+ include Pedantic::Html
45
+ include Pedantic::Emoticons
46
+ include Pedantic::Txt
47
+ include Pedantic::Punctuation
48
+ include Pedantic::Emphasis
49
+ include Pedantic::Unimportant
50
+ include Pedantic::Stems
51
+ end
@@ -0,0 +1,23 @@
1
+ require 'spec/spec_helper'
2
+
3
+ describe Pedantic::Emoticons do
4
+ it "replaces :) with smile" do
5
+ Pedantic.fix('foo :)').should == 'foo smile'
6
+ end
7
+
8
+ it "replaces :( with sad" do
9
+ Pedantic.fix('foo :(').should == 'foo sad'
10
+ end
11
+
12
+ it "replaces :D with happy" do
13
+ Pedantic.fix('foo :D').should == 'foo happi'
14
+ end
15
+
16
+ it "replaces :S and :s with unsure" do
17
+ Pedantic.fix('foo :S :s').should == 'foo unsur unsur'
18
+ end
19
+
20
+ it "replaces ;) with joke" do
21
+ Pedantic.fix('foo ;)').should == 'foo joke'
22
+ end
23
+ end
@@ -0,0 +1,15 @@
1
+ require 'spec/spec_helper'
2
+
3
+ describe Pedantic::Emphasis do
4
+ it "replaces reeeaalllly with realy" do
5
+ Pedantic.fix('Reaaally reeeaalllly').should == 'realli realli'
6
+ end
7
+
8
+ it "replaces okaaaay with okay" do
9
+ Pedantic.fix('Okaaaaay okaaay').should == 'okai okai'
10
+ end
11
+
12
+ it "replaces oooooh with ooh" do
13
+ Pedantic.fix('OoOoOoh').should == 'ooh'
14
+ end
15
+ end
@@ -0,0 +1,7 @@
1
+ require 'spec/spec_helper'
2
+
3
+ describe Pedantic::Html do
4
+ it "removes HTML quotes" do
5
+ Pedantic.fix('&quot;air quotes&quot;').should == 'air quot'
6
+ end
7
+ end
@@ -0,0 +1,19 @@
1
+ require 'spec/spec_helper'
2
+
3
+ describe Pedantic::Punctuation do
4
+ it "replaces can't with cannot" do
5
+ Pedantic.fix("can't foo").should == 'cannot foo'
6
+ end
7
+
8
+ it "removes other apostrophes" do
9
+ Pedantic.fix("isn't it foo").should == 'foo'
10
+ end
11
+
12
+ it "replaces sh*t with shit" do
13
+ Pedantic.fix('sh*t').should == 'shit'
14
+ end
15
+
16
+ it "replaces f**k and f*ck with fuck" do
17
+ Pedantic.fix('f**k f*ck').should == 'fuck fuck'
18
+ end
19
+ end
@@ -0,0 +1,11 @@
1
+ require 'spec/spec_helper'
2
+
3
+ describe Pedantic::Stems do
4
+ it "reduces words to their core stems" do
5
+ Pedantic.fix('wandering').should == 'wander'
6
+ end
7
+
8
+ it "reduces all words to their core stems" do
9
+ Pedantic.fix('wandering and running').should == 'wander run'
10
+ end
11
+ end
@@ -0,0 +1,15 @@
1
+ require 'spec/spec_helper'
2
+
3
+ describe Pedantic::Txt do
4
+ it "replaces tx with thank" do
5
+ Pedantic.fix('tx').should == 'thank'
6
+ end
7
+
8
+ it "replaces tnx with thank" do
9
+ Pedantic.fix('tnx').should == 'thank'
10
+ end
11
+
12
+ it "replaces thnks with thank" do
13
+ Pedantic.fix('thnks').should == 'thank'
14
+ end
15
+ end
@@ -0,0 +1,7 @@
1
+ require 'spec/spec_helper'
2
+
3
+ describe Pedantic::Unimportant do
4
+ it "removes words like 'the', 'a', 'and'" do
5
+ Pedantic.fix('a foo and the bar').should == 'foo bar'
6
+ end
7
+ end
@@ -0,0 +1,15 @@
1
+ require 'spec/spec_helper'
2
+
3
+ describe Pedantic::Uris do
4
+ it "removes explicit web addresses" do
5
+ Pedantic.fix('foo http://bar.com bar').should == 'foo bar'
6
+ end
7
+
8
+ it "removes explicit https addresses" do
9
+ Pedantic.fix('foo https://bar.com bar').should == 'foo bar'
10
+ end
11
+
12
+ it "removes implicit web addresses" do
13
+ Pedantic.fix('foo domain.com bar').should == 'foo bar'
14
+ end
15
+ end
@@ -0,0 +1,17 @@
1
+ require 'spec/spec_helper'
2
+
3
+ describe Pedantic do
4
+ describe '.fix' do
5
+ it "removes leading and trailing spaces" do
6
+ Pedantic.fix(' foo ').should == 'foo'
7
+ end
8
+
9
+ it "replaces multiple spaces with a single space" do
10
+ Pedantic.fix('foo bar').should == 'foo bar'
11
+ end
12
+
13
+ it "replaces new lines with spaces" do
14
+ Pedantic.fix("foo\nbar").should == 'foo bar'
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,10 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+
4
+ require 'pedantic'
5
+ require 'spec'
6
+ require 'spec/autorun'
7
+
8
+ Spec::Runner.configure do |config|
9
+ #
10
+ end
metadata ADDED
@@ -0,0 +1,104 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pedantic
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Pat Allan
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-02-08 00:00:00 +08:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: stemmer
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.0.1
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: rspec
27
+ type: :development
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 1.2.9
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: yard
37
+ type: :development
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ description: Pares text down to the words that matter
46
+ email: pat@freelancing-gods.com
47
+ executables: []
48
+
49
+ extensions: []
50
+
51
+ extra_rdoc_files:
52
+ - LICENSE
53
+ - README.textile
54
+ files:
55
+ - LICENSE
56
+ - README.textile
57
+ - lib/pedantic.rb
58
+ - lib/pedantic/emoticons.rb
59
+ - lib/pedantic/emphasis.rb
60
+ - lib/pedantic/html.rb
61
+ - lib/pedantic/punctuation.rb
62
+ - lib/pedantic/stems.rb
63
+ - lib/pedantic/txt.rb
64
+ - lib/pedantic/unimportant.rb
65
+ - lib/pedantic/uris.rb
66
+ has_rdoc: true
67
+ homepage: http://github.com/freelancing-god/pedantic
68
+ licenses: []
69
+
70
+ post_install_message:
71
+ rdoc_options:
72
+ - --charset=UTF-8
73
+ require_paths:
74
+ - lib
75
+ required_ruby_version: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: "0"
80
+ version:
81
+ required_rubygems_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: "0"
86
+ version:
87
+ requirements: []
88
+
89
+ rubyforge_project:
90
+ rubygems_version: 1.3.5
91
+ signing_key:
92
+ specification_version: 3
93
+ summary: Text Cleaner
94
+ test_files:
95
+ - spec/pedantic/emoticons_spec.rb
96
+ - spec/pedantic/emphasis_spec.rb
97
+ - spec/pedantic/html_spec.rb
98
+ - spec/pedantic/punctuation_spec.rb
99
+ - spec/pedantic/stems_spec.rb
100
+ - spec/pedantic/txt_spec.rb
101
+ - spec/pedantic/unimportant_spec.rb
102
+ - spec/pedantic/uris_spec.rb
103
+ - spec/pedantic_spec.rb
104
+ - spec/spec_helper.rb