clarifier 0.0.3 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +3 -0
- data/Gemfile.lock +1 -1
- data/lib/clarifier/stop_words.rb +31 -11
- data/lib/clarifier/version.rb +1 -1
- data/test/stop_words_test.rb +8 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a284e14c6ff8275e044ef65de1103c50f904b015
|
4
|
+
data.tar.gz: 89c5f5b8d614acd0b02efe2bb4c6fa117f225cc5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fabd1582bfe6cf87592e7c5c4bf0264beb3f02b7d173f0716033ad7908edd1ba9d8e88ee323f1fe818ba54ca201df14d62df4f12fec6da945cd74d3a802b172d
|
7
|
+
data.tar.gz: a337256cf75701be19943bfdf21217baecff51e1100096b1f03a98582f117df8bd76ad5e5a69b5d6875a247645dfff6a9fc50b5cc66a05420b2bbd97011f222e
|
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
CHANGED
data/lib/clarifier/stop_words.rb
CHANGED
@@ -9,7 +9,7 @@ module Clarifier
|
|
9
9
|
@@lists
|
10
10
|
end
|
11
11
|
|
12
|
-
def initialize(desired_stopwords = nil)
|
12
|
+
def initialize(desired_stopwords = nil, training_threshold = 0.8)
|
13
13
|
if desired_stopwords.kind_of?(Array)
|
14
14
|
@stopwords = desired_stopwords
|
15
15
|
elsif @@lists[desired_stopwords]
|
@@ -17,6 +17,7 @@ module Clarifier
|
|
17
17
|
else
|
18
18
|
@stopwords = @@lists[:en_gb_basic]
|
19
19
|
end
|
20
|
+
@training_threshold = training_threshold
|
20
21
|
end
|
21
22
|
|
22
23
|
def clarify(input)
|
@@ -32,18 +33,37 @@ module Clarifier
|
|
32
33
|
new_string
|
33
34
|
end
|
34
35
|
|
35
|
-
def
|
36
|
-
word_counts =
|
36
|
+
def reset
|
37
|
+
@word_counts = Hash.new(0)
|
38
|
+
@training_doc_count = 0
|
39
|
+
@stopwords = []
|
40
|
+
end
|
41
|
+
|
42
|
+
def refine(doc)
|
43
|
+
@word_counts ||= Hash.new(0)
|
44
|
+
@training_doc_count ||= 0
|
45
|
+
@training_doc_count += 1
|
46
|
+
words = doc.split
|
47
|
+
words.uniq!
|
48
|
+
words.each do |word|
|
49
|
+
@word_counts[word] += 1
|
50
|
+
end
|
51
|
+
select_stopwords_from_training
|
52
|
+
end
|
53
|
+
|
54
|
+
def train(docs, threshold = @training_threshold)
|
55
|
+
@word_counts = Hash.new(0)
|
56
|
+
@training_doc_count = 0
|
57
|
+
@training_threshold = threshold
|
37
58
|
docs.each do |doc|
|
38
|
-
|
39
|
-
words.uniq!
|
40
|
-
words.each do |word|
|
41
|
-
word_counts[word] ||= 0
|
42
|
-
word_counts[word] += 1
|
43
|
-
end
|
59
|
+
refine(doc)
|
44
60
|
end
|
45
|
-
|
46
|
-
|
61
|
+
end
|
62
|
+
|
63
|
+
def select_stopwords_from_training
|
64
|
+
@stopwords = []
|
65
|
+
@word_counts.each do |word, count|
|
66
|
+
if count.to_f / @training_doc_count >= @training_threshold
|
47
67
|
@stopwords << word
|
48
68
|
end
|
49
69
|
end
|
data/lib/clarifier/version.rb
CHANGED
data/test/stop_words_test.rb
CHANGED
@@ -45,7 +45,14 @@ module Clarifier
|
|
45
45
|
end
|
46
46
|
|
47
47
|
def test_incrementally_derive_a_stopword_list_from_docs
|
48
|
-
|
48
|
+
sw = Clarifier::StopWords.new([], 0.2)
|
49
|
+
StopWordsTestDocs.docs.each do |doc|
|
50
|
+
sw.refine(doc)
|
51
|
+
end
|
52
|
+
expected = %w(today we are going to discuss about the heart disease first for a run of concept and then we'll see what is between but at actually three concepts number one should special type whole time with people magic this an class workers that have up infection mission let's take our book bad give here or really back in which end bikini can inside them does especially by let me tell you particular produced who it don't know occurs because these dot part i when grow on its free political so hard out air your into that's true indeed talking right now has be across done must not go girl did if more supreme some over pretty friend like as bed system next their it's working think will start doesn't off there area present from good they stop him those diagram news red blood cell size cells regular he caused either environment two examples even pick way his side years easy however test many again chronic body were peers six having work tonight defined high loss jeans term process involves nothing help yeah get put world getting I'm waking doing us how lives living I do arm almost things same such close life hands you're just possible they're gonna happen down choose other rest changed tend little bit want car look mom looking kids kinda each she's got leg trying very quickly thing anything sorta yet case use only nineteen seventy medical been school medicine center health access patients proud last we're near new eighty nine group question towards complicated sometimes called able consistent too goes different both say he's all bodies physiology personal eyes no person until past within conditions would point come levels identify essential court well state act another against sort own view i'll refer make important under sir challenge face understanding uh... 
might began critical much sixty minutes sells understand week talk study hopefully lecture why won't live long without normal kidney chapter gross aspects differences fluid thought talked was created lot quite means show may block blue box around you'll remember low cast nerve signal four cardiac muscle contract yes action brains am could knowing always keep difficult general excess somehow sure left ask after pass basically said involved my young become fine spreading I'll through change maybe okay research most where buildings sections example find terrible cold coast something seeing moving mention boy fibers clearly underneath nuclei middle better try nice ah... took closely also bridge jason contraction ever ice had top trailer behind press didn't realize later seven brain hook surgery spinal cord)
|
53
|
+
assert_equal expected, sw.stopwords
|
54
|
+
sw.reset
|
55
|
+
assert_equal [], sw.stopwords
|
49
56
|
end
|
50
57
|
|
51
58
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: clarifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rob Styles
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-12-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -111,7 +111,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
111
111
|
version: '0'
|
112
112
|
requirements: []
|
113
113
|
rubyforge_project:
|
114
|
-
rubygems_version: 2.2.
|
114
|
+
rubygems_version: 2.2.2
|
115
115
|
signing_key:
|
116
116
|
specification_version: 4
|
117
117
|
summary: Clarifier is a stopwords library for removing common words from text
|