stringfu 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +2 -0
- data/Gemfile.lock +16 -0
- data/README.markdown +33 -0
- data/lib/stringfu/version.rb +1 -1
- data/lib/stringfu.rb +26 -1
- metadata +2 -1
data/Gemfile
CHANGED
data/Gemfile.lock
ADDED
data/README.markdown
CHANGED
@@ -1,3 +1,36 @@
|
|
1
1
|
# StringFu
|
2
2
|
|
3
3
|
StringFu is a small set of methods for manipulating strings so that they can be cleaned up for Natural Language Processing (NLP).
|
4
|
+
|
5
|
+
## Install StringFu
|
6
|
+
|
7
|
+
gem install stringfu
|
8
|
+
|
9
|
+
## Usage
|
10
|
+
|
11
|
+
wtf = 'Charlie the Unicorn is a potty mouth. He\'ll say things like, @!@#% !@$%$[@#$^!)'
|
12
|
+
|
13
|
+
punc_strip will strip away all standard punctuation.
|
14
|
+
|
15
|
+
wtf.punc_strip # => "Charlie the Unicorn is a potty mouth He'll say things like "
|
16
|
+
|
17
|
+
punc_gsub will replace punctuation with spaces if given no arguments.
|
18
|
+
|
19
|
+
wtf.punc_gsub # => "Charlie the Unicorn is a potty mouth He'll say things like "
|
20
|
+
|
21
|
+
punc_gsub can also take a replacement string as an argument.
|
22
|
+
|
23
|
+
wtf.punc_gsub "?" # => "Charlie the Unicorn is a potty mouth? He'll say things like? ????? ????????????"
|
24
|
+
|
25
|
+
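ngrams will generate n-grams for any string and return an array of the n-gram sizes that were generated. Each set of n-grams is then available through its own accessor (unigrams, bigrams, trigrams, _4grams, and so on).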
|
26
|
+
|
27
|
+
ftw = "I choose Whoppie Goldberg for the Win!"
|
28
|
+
ftw.ngrams # => [1, 2, 3, 4, 5, 6, 7]
|
29
|
+
|
30
|
+
ftw.unigrams # => ["I", "choose", "Whoppie", "Goldberg", "for", "the", "Win!"]
|
31
|
+
ftw.bigrams # => ["I choose", "choose Whoppie", "Whoppie Goldberg", "Goldberg for", "for the", "the Win!"]
|
32
|
+
ftw.trigrams # => ["I choose Whoppie", "choose Whoppie Goldberg", "Whoppie Goldberg for", "Goldberg for the", "for the Win!"]
|
33
|
+
ftw._4grams # => ["I choose Whoppie Goldberg", "choose Whoppie Goldberg for", "Whoppie Goldberg for the", "Goldberg for the Win!"]
|
34
|
+
|
35
|
+
ftw.ngrams 3..6 # => [3, 4, 5, 6]
|
36
|
+
ftw.ngrams 2, 7, 3..4 # => [2, 3, 4, 7]
|
data/lib/stringfu/version.rb
CHANGED
data/lib/stringfu.rb
CHANGED
@@ -1,4 +1,8 @@
|
|
1
|
-
require
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler/setup'
|
3
|
+
|
4
|
+
require 'stringfu/version'
|
5
|
+
require 'uea-stemmer'
|
2
6
|
|
3
7
|
STOP = ["a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an", "and", "any", "are", "as", "at", "be", "because", "been", "but", "by", "can", "cannot", "could", "dear", "did", "do", "does", "either", "else", "ever", "every", "for", "from", "get", "got", "had", "has", "have", "he", "her", "hers", "him", "his", "how", "however", "i", "if", "in", "into", "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me", "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off", "often", "on", "only", "or", "other", "our", "own", "rather", "said", "say", "says", "she", "should", "since", "so", "some", "than", "that", "the", "their", "them", "then", "there", "these", "they", "this", "tis", "to", "too", "twas", "us", "wants", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "would", "yet", "you", "your"]
|
4
8
|
|
@@ -45,6 +49,7 @@ module Stringfu
|
|
45
49
|
args.each do |num|
|
46
50
|
prefix = latinfy num
|
47
51
|
self.instance_variable_set "@#{prefix}grams", []
|
52
|
+
|
48
53
|
rounds = wordphrase.length - num + 1
|
49
54
|
rounds.times do |iter|
|
50
55
|
self.instance_variable_get("@#{prefix}grams") << wordphrase[iter...(iter+num)].join(" ")
|
@@ -52,10 +57,30 @@ module Stringfu
|
|
52
57
|
# Calls :attr_accessor to add new instance variable
|
53
58
|
self.class.__send__(:attr_accessor, "#{prefix}grams".to_sym)
|
54
59
|
self.instance_variable_get("@#{prefix}grams")
|
60
|
+
|
61
|
+
self.class.__send__(:alias_method, "_#{num}grams", "#{prefix}grams") if num <= 3
|
62
|
+
self.class.__send__(:alias_method, "_#{num}grams=", "#{prefix}grams=") if num <= 3
|
55
63
|
end
|
56
64
|
args
|
57
65
|
end
|
58
66
|
|
67
|
+
def stem
|
68
|
+
words = self.punc_strip.normalize.split
|
69
|
+
|
70
|
+
stemmer = UEAStemmer.new
|
71
|
+
words = words.map do |word|
|
72
|
+
stem = stemmer.stem word
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
def normalize
|
77
|
+
self.downcase.squeeze(" ").strip
|
78
|
+
end
|
79
|
+
|
80
|
+
def normalize!
|
81
|
+
replace(self.downcase.squeeze(" ").strip)
|
82
|
+
end
|
83
|
+
|
59
84
|
private
|
60
85
|
def latinfy num
|
61
86
|
prefix = {1 => "uni", 2 => "bi", 3 => "tri"}
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: stringfu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -20,6 +20,7 @@ extra_rdoc_files: []
|
|
20
20
|
files:
|
21
21
|
- .gitignore
|
22
22
|
- Gemfile
|
23
|
+
- Gemfile.lock
|
23
24
|
- README.markdown
|
24
25
|
- Rakefile
|
25
26
|
- lib/stringfu.rb
|