bayes_on_redis 0.1.3 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/datasets/stopwords.txt +38 -0
- data/lib/bayes_on_redis.rb +31 -3
- metadata +5 -4
@@ -0,0 +1,38 @@
|
|
1
|
+
a able about above abroad according accordingly across actually adj after afterwards again against ago ahead
|
2
|
+
ain't all allow allows almost alone along alongside already also although always am amid amidst among amongst
|
3
|
+
an and another any anybody anyhow anyone anything anyway anyways anywhere apart appear appreciate appropriate
|
4
|
+
are aren't around as a's aside ask asking associated at available away awfully b back backward backwards be
|
5
|
+
became because become becomes becoming been before beforehand begin behind being believe below beside besides
|
6
|
+
best better between beyond both brief but by c came can cannot cant can't caption cause causes certain certainly
|
7
|
+
changes clearly c'mon co co. com come comes concerning consequently consider considering contain containing
|
8
|
+
contains corresponding could couldn't course c's currently d dare daren't definitely described despite did didn't
|
9
|
+
different directly do does doesn't doing done don't down downwards during e each edu eg eight eighty either else
|
10
|
+
elsewhere end ending enough entirely especially et etc even ever evermore every everybody everyone everything
|
11
|
+
everywhere ex exactly example except f fairly far farther few fewer fifth first five followed following follows
|
12
|
+
for forever former formerly forth forward found four from further furthermore g get gets getting given gives go
|
13
|
+
goes going gone got gotten greetings h had hadn't half happens hardly has hasn't have haven't having he he'd he'll
|
14
|
+
hello help hence her here hereafter hereby herein here's hereupon hers herself he's hi him himself his hither
|
15
|
+
hopefully how howbeit however hundred i i'd ie if ignored i'll i'm immediate in inasmuch inc inc. indeed indicate
|
16
|
+
indicated indicates inner inside insofar instead into inward is isn't it it'd it'll its it's itself i've j just k
|
17
|
+
keep keeps kept know known knows l last lately later latter latterly least less lest let let's like liked likely
|
18
|
+
likewise little look looking looks low lower ltd m made mainly make makes many may maybe mayn't me mean meantime
|
19
|
+
meanwhile merely might mightn't mine minus miss more moreover most mostly mr mrs much must mustn't my myself n name
|
20
|
+
namely nd near nearly necessary need needn't needs neither never neverf neverless nevertheless new next nine ninety
|
21
|
+
no nobody non none nonetheless noone no-one nor normally not nothing notwithstanding novel now nowhere o obviously
|
22
|
+
of off often oh ok okay old on once one ones one's only onto opposite or other others otherwise ought oughtn't our
|
23
|
+
ours ourselves out outside over overall own p particular particularly past per perhaps placed please plus possible
|
24
|
+
presumably probably provided provides q que quite qv r rather rd re really reasonably recent recently regarding
|
25
|
+
regardless regards relatively respectively right round s said same saw say saying says second secondly see seeing
|
26
|
+
seem seemed seeming seems seen self selves sensible sent serious seriously seven several shall shan't she she'd
|
27
|
+
she'll she's should shouldn't since six so some somebody someday somehow someone something sometime sometimes
|
28
|
+
somewhat somewhere soon sorry specified specify specifying still sub such sup sure t take taken taking tell tends
|
29
|
+
th than thank thanks thanx that that'll thats that's that've the their theirs them themselves then thence there
|
30
|
+
thereafter thereby there'd therefore therein there'll there're theres there's thereupon there've these they they'd
|
31
|
+
they'll they're they've thing things think third thirty this thorough thoroughly those though three through
|
32
|
+
throughout thru thus till to together too took toward towards tried tries truly try trying t's twice two u un under
|
33
|
+
underneath undoing unfortunately unless unlike unlikely until unto up upon upwards us use used useful uses using
|
34
|
+
usually v value various versus very via viz vs w want wants was wasn't way we we'd welcome well we'll went were we're
|
35
|
+
weren't we've what whatever what'll what's what've when whence whenever where whereafter whereas whereby wherein
|
36
|
+
where's whereupon wherever whether which whichever while whilst whither who who'd whoever whole who'll whom whomever
|
37
|
+
who's whose why will willing wish with within without wonder won't would wouldn't x y yes yet you you'd you'll your
|
38
|
+
you're yours yourself yourselves you've z zero successful greatest began including being all for close but
|
data/lib/bayes_on_redis.rb
CHANGED
@@ -3,10 +3,14 @@ require "redis"
|
|
3
3
|
|
4
4
|
class BayesOnRedis
|
5
5
|
CATEGORIES_KEY = "BayesOnRedis:categories"
|
6
|
-
|
6
|
+
ONE_OR_TWO_WORDS_RE = /\b\w{1,2}\b/mi
|
7
|
+
NON_ALPHANUMERIC_AND_NON_DOT_RE = /[^\w\.]/mi
|
8
|
+
|
9
|
+
attr_reader :redis, :stopwords
|
7
10
|
|
8
11
|
def initialize(options)
|
9
12
|
@redis = Redis.new(:host => options[:redis_host], :port => options[:redis_port], :db => options[:redis_db])
|
13
|
+
@stopwords = Stopword.new
|
10
14
|
end
|
11
15
|
|
12
16
|
def flushdb
|
@@ -69,11 +73,35 @@ class BayesOnRedis
|
|
69
73
|
end
|
70
74
|
|
71
75
|
# Incoming text is always downcased
|
72
|
-
def count_occurance(text)
|
76
|
+
def count_occurance(text='')
|
73
77
|
raise "input must be instance of String" unless text.is_a?(String)
|
74
78
|
|
75
|
-
text.
|
79
|
+
text_chunks = text.downcase.gsub(ONE_OR_TWO_WORDS_RE, '').gsub(NON_ALPHANUMERIC_AND_NON_DOT_RE, ' ').gsub(@stopwords.to_re, '').split
|
80
|
+
text_chunks.inject(Hash.new(0)) do |container, word|
|
76
81
|
container[word] += 1; container
|
77
82
|
end
|
78
83
|
end
|
84
|
+
|
85
|
+
def remove_stopwords
|
86
|
+
@redis.smembers(CATEGORIES_KEY).each do |category|
|
87
|
+
@stopwords.to_a.each do |stopword|
|
88
|
+
@redis.hdel(redis_category_key(category), stopword)
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
|
95
|
+
class Stopword
|
96
|
+
def initialize
|
97
|
+
@stopwords = File.read(File.join("datasets", "stopwords.txt")).split
|
98
|
+
end
|
99
|
+
|
100
|
+
def to_a
|
101
|
+
@stopwords
|
102
|
+
end
|
103
|
+
|
104
|
+
def to_re
|
105
|
+
@to_re ||= /\b(#{@stopwords.join('|')})\b/mi
|
106
|
+
end
|
79
107
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bayes_on_redis
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 23
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
- 3
|
10
|
-
version: 0.1.3
|
9
|
+
- 6
|
10
|
+
version: 0.1.6
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Didip Kerabat
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-11-
|
18
|
+
date: 2010-11-26 00:00:00 -08:00
|
19
19
|
default_executable:
|
20
20
|
dependencies: []
|
21
21
|
|
@@ -30,6 +30,7 @@ extra_rdoc_files: []
|
|
30
30
|
files:
|
31
31
|
- README.markdown
|
32
32
|
- lib/bayes_on_redis.rb
|
33
|
+
- datasets/stopwords.txt
|
33
34
|
has_rdoc: true
|
34
35
|
homepage: https://github.com/didip/bayes_on_redis
|
35
36
|
licenses: []
|