clarifier 0.0.3 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d26cc44221f2412fcd48d2463a2f306a6d8a5ea7
4
- data.tar.gz: 333525a5ca5934d42b2c69cb3c7901be9b8fc765
3
+ metadata.gz: a284e14c6ff8275e044ef65de1103c50f904b015
4
+ data.tar.gz: 89c5f5b8d614acd0b02efe2bb4c6fa117f225cc5
5
5
  SHA512:
6
- metadata.gz: 58533ded8c5c7bd74310a6db0eed6f082136288bd19d1e2627010df201057435344495526cc8683fee45e9f7ad206ca2f3bea6a74337a9561e46e781f967384d
7
- data.tar.gz: 41ded05e9f3e2b4fba1fb628946e2154ae98e719e125df36e5dff22475b73a9aeadff54f9d0212c33eda574451129079ec8ad4c6c3f359c564ec421cb17e0450
6
+ metadata.gz: fabd1582bfe6cf87592e7c5c4bf0264beb3f02b7d173f0716033ad7908edd1ba9d8e88ee323f1fe818ba54ca201df14d62df4f12fec6da945cd74d3a802b172d
7
+ data.tar.gz: a337256cf75701be19943bfdf21217baecff51e1100096b1f03a98582f117df8bd76ad5e5a69b5d6875a247645dfff6a9fc50b5cc66a05420b2bbd97011f222e
data/CHANGELOG.md CHANGED
@@ -1,3 +1,6 @@
1
+ # 0.9.0 / 2014-12-18
2
+ * [FEATURE] Incremental training of stopwords list using refine() and reset()
3
+
1
4
  # 0.0.3 / 2014-04-07
2
5
  * [BUGFIX] Simplified regex
3
6
 
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- clarifier (0.0.3)
4
+ clarifier (0.9.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -9,7 +9,7 @@ module Clarifier
9
9
  @@lists
10
10
  end
11
11
 
12
- def initialize(desired_stopwords = nil)
12
+ def initialize(desired_stopwords = nil, training_threshold = 0.8)
13
13
  if desired_stopwords.kind_of?(Array)
14
14
  @stopwords = desired_stopwords
15
15
  elsif @@lists[desired_stopwords]
@@ -17,6 +17,7 @@ module Clarifier
17
17
  else
18
18
  @stopwords = @@lists[:en_gb_basic]
19
19
  end
20
+ @training_threshold = training_threshold
20
21
  end
21
22
 
22
23
  def clarify(input)
@@ -32,18 +33,37 @@ module Clarifier
32
33
  new_string
33
34
  end
34
35
 
35
- def train(docs, threshold = 0.8)
36
- word_counts = {}
36
+ def reset
37
+ @word_counts = Hash.new(0)
38
+ @training_doc_count = 0
39
+ @stopwords = []
40
+ end
41
+
42
+ def refine(doc)
43
+ @word_counts ||= Hash.new(0)
44
+ @training_doc_count ||= 0
45
+ @training_doc_count += 1
46
+ words = doc.split
47
+ words.uniq!
48
+ words.each do |word|
49
+ @word_counts[word] += 1
50
+ end
51
+ select_stopwords_from_training
52
+ end
53
+
54
+ def train(docs, threshold = @training_threshold)
55
+ @word_counts = Hash.new(0)
56
+ @training_doc_count = 0
57
+ @training_threshold = threshold
37
58
  docs.each do |doc|
38
- words = doc.split
39
- words.uniq!
40
- words.each do |word|
41
- word_counts[word] ||= 0
42
- word_counts[word] += 1
43
- end
59
+ refine(doc)
44
60
  end
45
- word_counts.each do |word,count|
46
- if count.to_f / docs.length >= threshold
61
+ end
62
+
63
+ def select_stopwords_from_training
64
+ @stopwords = []
65
+ @word_counts.each do |word, count|
66
+ if count.to_f / @training_doc_count >= @training_threshold
47
67
  @stopwords << word
48
68
  end
49
69
  end
@@ -1,3 +1,3 @@
1
1
  module Clarifier
2
- VERSION = "0.0.3"
2
+ VERSION = "0.9.0"
3
3
  end
@@ -45,7 +45,14 @@ module Clarifier
45
45
  end
46
46
 
47
47
  def test_incrementally_derive_a_stopword_list_from_docs
48
- skip
48
+ sw = Clarifier::StopWords.new([], 0.2)
49
+ StopWordsTestDocs.docs.each do |doc|
50
+ sw.refine(doc)
51
+ end
52
+ expected = %w(today we are going to discuss about the heart disease first for a run of concept and then we'll see what is between but at actually three concepts number one should special type whole time with people magic this an class workers that have up infection mission let's take our book bad give here or really back in which end bikini can inside them does especially by let me tell you particular produced who it don't know occurs because these dot part i when grow on its free political so hard out air your into that's true indeed talking right now has be across done must not go girl did if more supreme some over pretty friend like as bed system next their it's working think will start doesn't off there area present from good they stop him those diagram news red blood cell size cells regular he caused either environment two examples even pick way his side years easy however test many again chronic body were peers six having work tonight defined high loss jeans term process involves nothing help yeah get put world getting I'm waking doing us how lives living I do arm almost things same such close life hands you're just possible they're gonna happen down choose other rest changed tend little bit want car look mom looking kids kinda each she's got leg trying very quickly thing anything sorta yet case use only nineteen seventy medical been school medicine center health access patients proud last we're near new eighty nine group question towards complicated sometimes called able consistent too goes different both say he's all bodies physiology personal eyes no person until past within conditions would point come levels identify essential court well state act another against sort own view i'll refer make important under sir challenge face understanding uh... might began critical much sixty minutes sells understand week talk study hopefully lecture why won't live long without normal kidney chapter gross aspects differences fluid thought talked was created lot quite means show may block blue box around you'll remember low cast nerve signal four cardiac muscle contract yes action brains am could knowing always keep difficult general excess somehow sure left ask after pass basically said involved my young become fine spreading I'll through change maybe okay research most where buildings sections example find terrible cold coast something seeing moving mention boy fibers clearly underneath nuclei middle better try nice ah... took closely also bridge jason contraction ever ice had top trailer behind press didn't realize later seven brain hook surgery spinal cord)
53
+ assert_equal expected, sw.stopwords
54
+ sw.reset
55
+ assert_equal [], sw.stopwords
49
56
  end
50
57
 
51
58
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: clarifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rob Styles
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-04-07 00:00:00.000000000 Z
11
+ date: 2014-12-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -111,7 +111,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
111
111
  version: '0'
112
112
  requirements: []
113
113
  rubyforge_project:
114
- rubygems_version: 2.2.0
114
+ rubygems_version: 2.2.2
115
115
  signing_key:
116
116
  specification_version: 4
117
117
  summary: Clarifier is a stopwords library for removing common words from text