bayes_on_redis 0.1.3 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- data/datasets/stopwords.txt +38 -0
- data/lib/bayes_on_redis.rb +31 -3
- metadata +5 -4
@@ -0,0 +1,38 @@
|
|
1
|
+
a able about above abroad according accordingly across actually adj after afterwards again against ago ahead
|
2
|
+
ain't all allow allows almost alone along alongside already also although always am amid amidst among amongst
|
3
|
+
an and another any anybody anyhow anyone anything anyway anyways anywhere apart appear appreciate appropriate
|
4
|
+
are aren't around as a's aside ask asking associated at available away awfully b back backward backwards be
|
5
|
+
became because become becomes becoming been before beforehand begin behind being believe below beside besides
|
6
|
+
best better between beyond both brief but by c came can cannot cant can't caption cause causes certain certainly
|
7
|
+
changes clearly c'mon co co. com come comes concerning consequently consider considering contain containing
|
8
|
+
contains corresponding could couldn't course c's currently d dare daren't definitely described despite did didn't
|
9
|
+
different directly do does doesn't doing done don't down downwards during e each edu eg eight eighty either else
|
10
|
+
elsewhere end ending enough entirely especially et etc even ever evermore every everybody everyone everything
|
11
|
+
everywhere ex exactly example except f fairly far farther few fewer fifth first five followed following follows
|
12
|
+
for forever former formerly forth forward found four from further furthermore g get gets getting given gives go
|
13
|
+
goes going gone got gotten greetings h had hadn't half happens hardly has hasn't have haven't having he he'd he'll
|
14
|
+
hello help hence her here hereafter hereby herein here's hereupon hers herself he's hi him himself his hither
|
15
|
+
hopefully how howbeit however hundred i i'd ie if ignored i'll i'm immediate in inasmuch inc inc. indeed indicate
|
16
|
+
indicated indicates inner inside insofar instead into inward is isn't it it'd it'll its it's itself i've j just k
|
17
|
+
keep keeps kept know known knows l last lately later latter latterly least less lest let let's like liked likely
|
18
|
+
likewise little look looking looks low lower ltd m made mainly make makes many may maybe mayn't me mean meantime
|
19
|
+
meanwhile merely might mightn't mine minus miss more moreover most mostly mr mrs much must mustn't my myself n name
|
20
|
+
namely nd near nearly necessary need needn't needs neither never neverf neverless nevertheless new next nine ninety
|
21
|
+
no nobody non none nonetheless noone no-one nor normally not nothing notwithstanding novel now nowhere o obviously
|
22
|
+
of off often oh ok okay old on once one ones one's only onto opposite or other others otherwise ought oughtn't our
|
23
|
+
ours ourselves out outside over overall own p particular particularly past per perhaps placed please plus possible
|
24
|
+
presumably probably provided provides q que quite qv r rather rd re really reasonably recent recently regarding
|
25
|
+
regardless regards relatively respectively right round s said same saw say saying says second secondly see seeing
|
26
|
+
seem seemed seeming seems seen self selves sensible sent serious seriously seven several shall shan't she she'd
|
27
|
+
she'll she's should shouldn't since six so some somebody someday somehow someone something sometime sometimes
|
28
|
+
somewhat somewhere soon sorry specified specify specifying still sub such sup sure t take taken taking tell tends
|
29
|
+
th than thank thanks thanx that that'll thats that's that've the their theirs them themselves then thence there
|
30
|
+
thereafter thereby there'd therefore therein there'll there're theres there's thereupon there've these they they'd
|
31
|
+
they'll they're they've thing things think third thirty this thorough thoroughly those though three through
|
32
|
+
throughout thru thus till to together too took toward towards tried tries truly try trying t's twice two u un under
|
33
|
+
underneath undoing unfortunately unless unlike unlikely until unto up upon upwards us use used useful uses using
|
34
|
+
usually v value various versus very via viz vs w want wants was wasn't way we we'd welcome well we'll went were we're
|
35
|
+
weren't we've what whatever what'll what's what've when whence whenever where whereafter whereas whereby wherein
|
36
|
+
where's whereupon wherever whether which whichever while whilst whither who who'd whoever whole who'll whom whomever
|
37
|
+
who's whose why will willing wish with within without wonder won't would wouldn't x y yes yet you you'd you'll your
|
38
|
+
you're yours yourself yourselves you've z zero successful greatest began including being all for close but
|
data/lib/bayes_on_redis.rb
CHANGED
@@ -3,10 +3,14 @@ require "redis"
|
|
3
3
|
|
4
4
|
class BayesOnRedis
|
5
5
|
CATEGORIES_KEY = "BayesOnRedis:categories"
|
6
|
-
|
6
|
+
ONE_OR_TWO_WORDS_RE = /\b\w{1,2}\b/mi
|
7
|
+
NON_ALPHANUMERIC_AND_NON_DOT_RE = /[^\w\.]/mi
|
8
|
+
|
9
|
+
attr_reader :redis, :stopwords
|
7
10
|
|
8
11
|
# Sets up the classifier: connects to Redis using the settings in +options+
# and loads the stopword list.
#
# options - Hash read for the keys :redis_host, :redis_port and :redis_db.
def initialize(options)
  connection_settings = {
    :host => options[:redis_host],
    :port => options[:redis_port],
    :db   => options[:redis_db]
  }
  @redis = Redis.new(connection_settings)
  @stopwords = Stopword.new
end
|
11
15
|
|
12
16
|
def flushdb
|
@@ -69,11 +73,35 @@ class BayesOnRedis
|
|
69
73
|
end
|
70
74
|
|
71
75
|
# Counts how often each significant word occurs in +text+.
#
# Incoming text is always downcased. Stopwords, one- and two-character
# words, and every character that is neither a word character nor a dot
# are then stripped before the remainder is split on whitespace.
#
# text - the String to analyse (defaults to an empty string).
#
# Returns a Hash mapping each remaining word to its occurrence count.
# Raises RuntimeError when +text+ is not a String.
def count_occurance(text='')
  raise "input must be instance of String" unless text.is_a?(String)

  # Stopwords are removed first, while the text is still intact. The two
  # later passes break contractions apart ("don't" becomes "don"), which
  # would stop apostrophe stopwords from matching and leave their fragments
  # behind to be counted as words.
  cleaned = text.downcase.
    gsub(@stopwords.to_re, '').
    gsub(ONE_OR_TWO_WORDS_RE, '').
    gsub(NON_ALPHANUMERIC_AND_NON_DOT_RE, ' ')

  cleaned.split.inject(Hash.new(0)) do |container, word|
    container[word] += 1; container
  end
end
|
84
|
+
|
85
|
+
# For every member of the CATEGORIES_KEY set, issues an HDEL for each
# stopword against the key returned by redis_category_key for that member.
def remove_stopwords
  category_names = @redis.smembers(CATEGORIES_KEY)
  category_names.each do |name|
    @stopwords.to_a.each { |term| @redis.hdel(redis_category_key(name), term) }
  end
end
|
92
|
+
end
|
93
|
+
|
94
|
+
|
95
|
+
# Holds a list of stopwords read from a whitespace-separated text file and
# exposes it either as an Array or as one compiled regular expression.
class Stopword
  # Location of the bundled list, resolved relative to this source file
  # (lib/ -> ../datasets/stopwords.txt) so that loading does not depend on
  # the current working directory of the process.
  DEFAULT_PATH = File.expand_path(File.join("..", "datasets", "stopwords.txt"), File.dirname(__FILE__))

  # path - path of the stopword file to read (defaults to the bundled list).
  def initialize(path = DEFAULT_PATH)
    @stopwords = File.read(path).split
  end

  # Returns the stopwords as an Array of Strings, in file order.
  def to_a
    @stopwords
  end

  # Returns a memoized, case-insensitive Regexp matching any stopword
  # between word boundaries. Each word is escaped so that entries such as
  # "co." or "inc." match a literal dot instead of any character (which
  # would also swallow unrelated words like "cow" or "inch").
  def to_re
    @to_re ||= /\b(#{@stopwords.map { |word| Regexp.escape(word) }.join('|')})\b/mi
  end
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bayes_on_redis
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 23
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
- 3
|
10
|
-
version: 0.1.3
|
9
|
+
- 6
|
10
|
+
version: 0.1.6
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Didip Kerabat
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-11-
|
18
|
+
date: 2010-11-26 00:00:00 -08:00
|
19
19
|
default_executable:
|
20
20
|
dependencies: []
|
21
21
|
|
@@ -30,6 +30,7 @@ extra_rdoc_files: []
|
|
30
30
|
files:
|
31
31
|
- README.markdown
|
32
32
|
- lib/bayes_on_redis.rb
|
33
|
+
- datasets/stopwords.txt
|
33
34
|
has_rdoc: true
|
34
35
|
homepage: https://github.com/didip/bayes_on_redis
|
35
36
|
licenses: []
|