word_stats 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in word_stats.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Casper
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,61 @@
1
+ # WordStats
2
+
3
+ WordStats provides a set of methods useful for counting character and word frequencies.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'word_stats'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install word_stats
18
+
19
+ ## Usage
20
+
21
+ Require the WordStats gem as follows:
22
+
23
+ require 'word_stats' # Remember to require RubyGems first if using Ruby 1.8
24
+
25
+ text = "The quick brown fox jumps over the lazy dog."
26
+ # Note: all strings processed by WordStats are downcased!!
27
+
28
+ WordStats provides shortcuts for single-letter frequencies, bigrams and trigrams. The `WordStats::Characters.ngrams(n,text)` method can be used to find n-grams of any length. The output is a hash that maps each n-gram (as a symbol) to the number of times it occurs; n-grams that never occur have a count of 0.
29
+
30
+ letter_frequencies = WordStats::Characters.letters(text)
31
+ letter_frequencies[:'u'] #=> 2
32
+
33
+ bigrams = WordStats::Characters.bigrams(text)
34
+ bigrams[:'th'] #=> 2
35
+
36
+ trigrams = WordStats::Characters.trigrams(text)
37
+ trigrams['qui'.to_sym] #=> 1
38
+
39
+ octocats = WordStats::Characters.ngrams(8,text)
40
+ octocats[:'The quic'] #=> 0
41
+ octocats[:'the quic'] #=> 1
42
+
43
+ Similarly, WordStats provides a method to count words and any arbitrary length sequence of words:
44
+
45
+ word_count = WordStats::Words.nwords(1,text)
46
+ word_count[:'the'] #=> 2
47
+
48
+ word_pairs = WordStats::Words.nwords(2,text)
49
+ word_pairs[:'quick brown'] #=> 1
50
+
51
+ ## Important Notes
52
+
53
+ WordStats will downcase any string that you pass into it. It also strips punctuation before processing.
54
+
55
+ ## Contributing
56
+
57
+ 1. Fork it
58
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
59
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
60
+ 4. Push to the branch (`git push origin my-new-feature`)
61
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
@@ -0,0 +1,8 @@
1
class String
  ##
  # Returns a copy of the string with the following punctuation characters
  # deleted: . , ! ? ; [ ] ( ) ' " / \ `
  #
  # All other characters (whitespace, dashes, underscores, colons, ...) are
  # left untouched. The receiver itself is not modified.
  def remove_punctuation
    gsub(/[.,!?;\[\]()'"\/\\`]/, '')
  end
end
@@ -0,0 +1,3 @@
1
module WordStats
  # Release version of the word_stats gem.
  VERSION = '0.0.1'
end
data/lib/word_stats.rb ADDED
@@ -0,0 +1,53 @@
1
+ require "word_stats/version"
2
+ require "word_stats/string_extensions"
3
+
4
##
# This module exposes a collection of methods available for statistical analysis of strings of text.
#
# Every counting method first normalizes its input: punctuation is stripped
# with String#remove_punctuation and the result is downcased. Results are
# Hashes keyed by Symbol whose default value is 0, so looking up a sequence
# that never occurred yields 0 rather than nil.
module WordStats

  module Characters

    ##
    # Computes a letter frequency distribution of the input string.
    # Shortcut for ngrams(1, text).
    def self.letters(text)
      ngrams(1, text)
    end

    ##
    # Computes the bigram frequency of the input string.
    # Shortcut for ngrams(2, text).
    def self.bigrams(text)
      ngrams(2, text)
    end

    ##
    # Computes the trigram frequency of the input string.
    # Shortcut for ngrams(3, text).
    def self.trigrams(text)
      ngrams(3, text)
    end

    ##
    # Computes the N-gram (character sequence) frequency of the input string.
    #
    # n    - length of each character sequence; must be a positive Integer.
    # text - the String to analyse.
    #
    # Whitespace is kept, so an n-gram may span a word boundary.
    #
    # Returns a Hash (default value 0) mapping each n-gram, as a Symbol, to
    # the number of times it occurs. The Hash is empty when the normalized
    # text is shorter than n.
    # Raises ArgumentError if n is not a positive Integer.
    def self.ngrams(n, text)
      raise ArgumentError, "n must be a positive integer" unless n.is_a?(Integer) && n > 0

      counts = Hash.new(0)
      # each_cons yields every window of n consecutive characters.
      text.remove_punctuation.downcase.chars.each_cons(n) do |gram|
        counts[gram.join.to_sym] += 1
      end
      counts
    end

  end

  module Words

    ##
    # Computes the frequency of every sequence of n consecutive words.
    #
    # n    - number of words per sequence; must be a positive Integer.
    # text - the String to analyse.
    #
    # Words are maximal runs of non-whitespace characters, so leading,
    # trailing and repeated whitespace never produce empty words.
    #
    # Returns a Hash (default value 0) mapping each sequence, as a Symbol of
    # the words joined by single spaces, to the number of times it occurs.
    # Raises ArgumentError if n is not a positive Integer.
    def self.nwords(n, text)
      raise ArgumentError, "n must be a positive integer" unless n.is_a?(Integer) && n > 0

      counts = Hash.new(0)
      # scan(/\S+/) instead of split(/\s+/): split would emit a leading ""
      # token whenever the text starts with whitespace.
      text.remove_punctuation.downcase.scan(/\S+/).each_cons(n) do |words|
        counts[words.join(' ').to_sym] += 1
      end
      counts
    end

  end

end
@@ -0,0 +1,36 @@
1
+ require 'word_stats'
2
+
3
# Specs for the character-level n-gram counters in WordStats::Characters.
describe WordStats::Characters do
  it "should produce a correct letter distribution" do
    text = "abbcccdddd"
    # Call the public .letters shortcut (not ngrams(1, ...)) so that the
    # shortcut itself is covered by the suite.
    letters = WordStats::Characters.letters(text)
    letters[:'d'].should == 4
    letters[:'b'].should == 2
    letters[:'e'].should == 0
  end

  it "should correctly identify the bigrams" do
    text = "The quick brown fox jumps over the lazy dog."
    bigrams = WordStats::Characters.bigrams(text)
    bigrams[:'th'].should == 2
    bigrams[:'he'].should == 2
    bigrams[:'qu'].should == 1
    # Punctuation is stripped before counting, so "g." can never occur.
    bigrams[:'g.'].should == 0
  end

  it "should correctly identify trigrams" do
    text = "Harharhar! It was just a harmless joke!"
    trigrams = WordStats::Characters.trigrams(text)
    trigrams[:'har'].should == 4
    # Whitespace is kept, so trigrams may span word boundaries.
    trigrams[:'s j'].should == 2
    trigrams[:'ke!'].should == 0
  end

  it "should correctly identify 8-grams" do
    text = "The octopus has eight legs."
    octograms = WordStats::Characters.ngrams(8,text)
    octograms[:' octopus'].should == 1
    octograms[:'has nine'].should == 0
  end

end
@@ -0,0 +1,26 @@
1
+ require 'word_stats'
2
+
3
# Specs for the String#remove_punctuation core extension.
describe String,"#remove_punctuation" do
  it "should remove exclamation points, periods, and question marks" do
    stripped = "Hello! My name is Casper. How may I help you?".remove_punctuation
    stripped.should == "Hello My name is Casper How may I help you"
  end

  it "should remove single quotes and double quotes" do
    stripped = "\'Hey,\' said the \"Dude\".".remove_punctuation
    stripped.should == "Hey said the Dude"
  end

  it "should remove parenthesis and brackets" do
    stripped = "fo[ob].ar()".remove_punctuation
    stripped.should == "foobar"
  end

  it "should remove backticks and semicolons" do
    stripped = "Java stmts end in semicolons; Ruby does cool things like `ls`.".remove_punctuation
    stripped.should == "Java stmts end in semicolons Ruby does cool things like ls"
  end

  # Placeholders: how callers would opt in to stripping these is undecided.
  it "should remove dashes if set to do so" do
    pending "configuration preferences or hash object passed to method?"
  end

  it "should remove underscores if set to do so" do
    pending "same as dashes"
  end
end
@@ -0,0 +1,16 @@
1
+ require 'word_stats'
2
+
3
# Specs for word and word-sequence counting in WordStats::Words.
describe WordStats::Words do
  it "should correctly count words" do
    counts = WordStats::Words.nwords(1, "How many times have I used the word times?")
    counts[:times].should == 2
    counts[:have].should == 1
  end

  it "should correctly identify word pairs" do
    sentence = "Sometimes I run. Sometimes I jump. Sometimes, however, I sit."
    pairs = WordStats::Words.nwords(2, sentence)
    pairs[:'sometimes i'].should == 2
    pairs[:'never do'].should == 0
  end
end
@@ -0,0 +1,17 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/word_stats/version', __FILE__)
3
+
4
# Gem specification for word_stats. The version is read from
# WordStats::VERSION, loaded by the require at the top of this file.
Gem::Specification.new do |gem|
  gem.authors       = ["Casper"]
  gem.email         = ["cholmgreen@gmail.com"]
  gem.description   = "Provides a set of methods useful for analysis of plaintext"
  gem.summary       = "Provides a set of methods useful for analysis of plaintext"
  gem.homepage      = ""
  # The bundled LICENSE file is the MIT License; declare it so that it is
  # recorded in the built gem's metadata.
  gem.licenses      = ["MIT"]

  # Split on $/ (the input record separator, "\n" by default). The output
  # record separator $\ is nil by default, which makes String#split fall back
  # to splitting on whitespace and breaks any tracked path containing a space.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.name          = "word_stats"
  gem.require_paths = ["lib"]
  gem.version       = WordStats::VERSION
end
metadata ADDED
@@ -0,0 +1,61 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: word_stats
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Casper
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-08-06 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Provides a set of methods useful for analysis of plaintext
15
+ email:
16
+ - cholmgreen@gmail.com
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - .gitignore
22
+ - .rspec
23
+ - Gemfile
24
+ - LICENSE
25
+ - README.md
26
+ - Rakefile
27
+ - lib/word_stats.rb
28
+ - lib/word_stats/string_extensions.rb
29
+ - lib/word_stats/version.rb
30
+ - spec/characters_spec.rb
31
+ - spec/string_spec.rb
32
+ - spec/words_spec.rb
33
+ - word_stats.gemspec
34
+ homepage: ''
35
+ licenses: []
36
+ post_install_message:
37
+ rdoc_options: []
38
+ require_paths:
39
+ - lib
40
+ required_ruby_version: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ required_rubygems_version: !ruby/object:Gem::Requirement
47
+ none: false
48
+ requirements:
49
+ - - ! '>='
50
+ - !ruby/object:Gem::Version
51
+ version: '0'
52
+ requirements: []
53
+ rubyforge_project:
54
+ rubygems_version: 1.8.24
55
+ signing_key:
56
+ specification_version: 3
57
+ summary: Provides a set of methods useful for analysis of plaintext
58
+ test_files:
59
+ - spec/characters_spec.rb
60
+ - spec/string_spec.rb
61
+ - spec/words_spec.rb