stringfu 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +2 -0
- data/Gemfile.lock +16 -0
- data/README.markdown +33 -0
- data/lib/stringfu/version.rb +1 -1
- data/lib/stringfu.rb +26 -1
- metadata +2 -1
data/Gemfile
CHANGED
data/Gemfile.lock
ADDED
data/README.markdown
CHANGED
@@ -1,3 +1,36 @@
|
|
1
1
|
# StringFu
|
2
2
|
|
3
3
|
StringFu is a set of simple methods for manipulating strings so that they are cleaned up for Natural Language Processing (NLP).
|
4
|
+
|
5
|
+
## Install StringFu
|
6
|
+
|
7
|
+
gem install stringfu
|
8
|
+
|
9
|
+
## Usage
|
10
|
+
|
11
|
+
wtf = 'Charlie the Unicorn is a potty mouth. He\'ll say things like, @!@#% !@$%$[@#$^!)'
|
12
|
+
|
13
|
+
punc_strip will strip away all standard punctuation.
|
14
|
+
|
15
|
+
wtf.punc_strip # => "Charlie the Unicorn is a potty mouth He'll say things like "
|
16
|
+
|
17
|
+
punc_gsub will replace punctuation with spaces if given no arguments.
|
18
|
+
|
19
|
+
wtf.punc_gsub # => "Charlie the Unicorn is a potty mouth He'll say things like "
|
20
|
+
|
21
|
+
punc_gsub can also take arguments
|
22
|
+
|
23
|
+
wtf.punc_gsub "?" # => "Charlie the Unicorn is a potty mouth? He'll say things like? ????? ????????????"
|
24
|
+
|
25
|
+
ngrams will generate n-grams for any string and return an array of the n-gram sizes that were generated
|
26
|
+
|
27
|
+
ftw = "I choose Whoppie Goldberg for the Win!"
|
28
|
+
ftw.ngrams # => [1, 2, 3, 4, 5, 6, 7]
|
29
|
+
|
30
|
+
ftw.unigrams # => ["I", "choose", "Whoppie", "Goldberg", "for", "the", "Win!"]
|
31
|
+
ftw.bigrams # => ["I choose", "choose Whoppie", "Whoppie Goldberg", "Goldberg for", "for the", "the Win!"]
|
32
|
+
ftw.trigrams # => ["I choose Whoppie", "choose Whoppie Goldberg", "Whoppie Goldberg for", "Goldberg for the", "for the Win!"]
|
33
|
+
ftw._4grams # => ["I choose Whoppie Goldberg", "choose Whoppie Goldberg for", "Whoppie Goldberg for the", "Goldberg for the Win!"]
|
34
|
+
|
35
|
+
ftw.ngrams 3..6 # => [3, 4, 5, 6]
|
36
|
+
ftw.ngrams 2, 7, 3..4 # => [2, 3, 4, 7]
|
data/lib/stringfu/version.rb
CHANGED
data/lib/stringfu.rb
CHANGED
@@ -1,4 +1,8 @@
|
|
1
|
-
require
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler/setup'
|
3
|
+
|
4
|
+
require 'stringfu/version'
|
5
|
+
require 'uea-stemmer'
|
2
6
|
|
3
7
|
STOP = ["a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an", "and", "any", "are", "as", "at", "be", "because", "been", "but", "by", "can", "cannot", "could", "dear", "did", "do", "does", "either", "else", "ever", "every", "for", "from", "get", "got", "had", "has", "have", "he", "her", "hers", "him", "his", "how", "however", "i", "if", "in", "into", "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me", "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off", "often", "on", "only", "or", "other", "our", "own", "rather", "said", "say", "says", "she", "should", "since", "so", "some", "than", "that", "the", "their", "them", "then", "there", "these", "they", "this", "tis", "to", "too", "twas", "us", "wants", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "would", "yet", "you", "your"]
|
4
8
|
|
@@ -45,6 +49,7 @@ module Stringfu
|
|
45
49
|
args.each do |num|
|
46
50
|
prefix = latinfy num
|
47
51
|
self.instance_variable_set "@#{prefix}grams", []
|
52
|
+
|
48
53
|
rounds = wordphrase.length - num + 1
|
49
54
|
rounds.times do |iter|
|
50
55
|
self.instance_variable_get("@#{prefix}grams") << wordphrase[iter...(iter+num)].join(" ")
|
@@ -52,10 +57,30 @@ module Stringfu
|
|
52
57
|
# Calls :attr_accessor to add new instance variable
|
53
58
|
self.class.__send__(:attr_accessor, "#{prefix}grams".to_sym)
|
54
59
|
self.instance_variable_get("@#{prefix}grams")
|
60
|
+
|
61
|
+
self.class.__send__(:alias_method, "_#{num}grams", "#{prefix}grams") if num <= 3
|
62
|
+
self.class.__send__(:alias_method, "_#{num}grams=", "#{prefix}grams=") if num <= 3
|
55
63
|
end
|
56
64
|
args
|
57
65
|
end
|
58
66
|
|
67
|
+
def stem # Returns an array with the stemmed form of each word in the receiver.
|
68
|
+
words = self.punc_strip.normalize.split # strip punctuation, normalize, then split on whitespace
|
69
|
+
|
70
|
+
stemmer = UEAStemmer.new # from the 'uea-stemmer' gem required at the top of the file
|
71
|
+
words = words.map do |word| # this assignment is the last expression, so the mapped array is the return value
|
72
|
+
stem = stemmer.stem word # local 'stem' is never read; the block's value is the stemmer's result
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
def normalize # Returns a normalized copy of the receiver; the receiver itself is not modified.
|
77
|
+
self.downcase.squeeze(" ").strip # lowercase, collapse runs of spaces to one, trim leading/trailing whitespace
|
78
|
+
end
|
79
|
+
|
80
|
+
def normalize! # In-place variant: overwrites the receiver's contents with its normalized form.
|
81
|
+
replace(self.downcase.squeeze(" ").strip) # presumably String#replace (returns the receiver) - confirm the module is only mixed into String
|
82
|
+
end
|
83
|
+
|
59
84
|
private
|
60
85
|
def latinfy num
|
61
86
|
prefix = {1 => "uni", 2 => "bi", 3 => "tri"}
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: stringfu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -20,6 +20,7 @@ extra_rdoc_files: []
|
|
20
20
|
files:
|
21
21
|
- .gitignore
|
22
22
|
- Gemfile
|
23
|
+
- Gemfile.lock
|
23
24
|
- README.markdown
|
24
25
|
- Rakefile
|
25
26
|
- lib/stringfu.rb
|