rbtagger 0.3.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -9,6 +9,6 @@ if !have_func('snprintf', 'stdio.h')
9
9
  raise "You must have snprintf available to compile this library"
10
10
  end
11
11
 
12
- CFLAGS='-Wall -g'
12
+ CFLAGS='-Wall'
13
13
 
14
14
  create_makefile('rule_tagger')
@@ -1,6 +1,6 @@
1
1
  have_header: checking for stdlib.h... -------------------- yes
2
2
 
3
- "/usr/bin/cpp-4.0 -I. -I/opt/local/lib/ruby/1.8/i686-darwin9.6.0 -I. -I/opt/local/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -I/opt/local/include -O2 -fno-common -pipe -fno-common conftest.c -o conftest.i"
3
+ "gcc -E -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC conftest.c -o conftest.i"
4
4
  checked program was:
5
5
  /* begin */
6
6
  1: #include <stdlib.h>
@@ -10,7 +10,7 @@ checked program was:
10
10
 
11
11
  have_header: checking for string.h... -------------------- yes
12
12
 
13
- "/usr/bin/cpp-4.0 -I. -I/opt/local/lib/ruby/1.8/i686-darwin9.6.0 -I. -I/opt/local/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -I/opt/local/include -O2 -fno-common -pipe -fno-common conftest.c -o conftest.i"
13
+ "gcc -E -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC conftest.c -o conftest.i"
14
14
  checked program was:
15
15
  /* begin */
16
16
  1: #include <string.h>
@@ -20,7 +20,7 @@ checked program was:
20
20
 
21
21
  have_library: checking for main() in -lc... -------------------- yes
22
22
 
23
- "/usr/bin/gcc-4.0 -o conftest -I. -I/opt/local/lib/ruby/1.8/i686-darwin9.6.0 -I. -I/opt/local/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -I/opt/local/include -O2 -fno-common -pipe -fno-common conftest.c -L. -L/opt/local/lib -L. -L/opt/local/lib -lruby-static -lc -lpthread -ldl -lobjc "
23
+ "gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lruby-static -lc -lpthread -ldl -lcrypt -lm -lc"
24
24
  checked program was:
25
25
  /* begin */
26
26
  1: /*top*/
@@ -32,7 +32,7 @@ checked program was:
32
32
 
33
33
  have_func: checking for snprintf() in stdio.h... -------------------- yes
34
34
 
35
- "/usr/bin/gcc-4.0 -o conftest -I. -I/opt/local/lib/ruby/1.8/i686-darwin9.6.0 -I. -I/opt/local/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -I/opt/local/include -O2 -fno-common -pipe -fno-common conftest.c -L. -L/opt/local/lib -L. -L/opt/local/lib -lc -lruby-static -lc -lpthread -ldl -lobjc "
35
+ "gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lc -lruby-static -lc -lpthread -ldl -lcrypt -lm -lc"
36
36
  checked program was:
37
37
  /* begin */
38
38
  1: #include <stdio.h>
@@ -1,4 +1,5 @@
1
1
  require 'mkmf'
2
+ $CFLAGS << ' -Wall'
2
3
 
3
4
  dir_config("word_tagger")
4
5
  have_library("c", "main")
@@ -1,6 +1,6 @@
1
1
  have_library: checking for main() in -lc... -------------------- yes
2
2
 
3
- "/usr/bin/gcc-4.0 -o conftest -I. -I/opt/local/lib/ruby/1.8/i686-darwin9.6.0 -I. -I/opt/local/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -I/opt/local/include -O2 -fno-common -pipe -fno-common conftest.c -L. -L/opt/local/lib -L. -L/opt/local/lib -lruby-static -lc -lpthread -ldl -lobjc "
3
+ "gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC -Wall conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lruby-static -lc -lpthread -ldl -lcrypt -lm -lc"
4
4
  checked program was:
5
5
  /* begin */
6
6
  1: /*top*/
@@ -12,7 +12,7 @@ checked program was:
12
12
 
13
13
  have_library: checking for main() in -lstdc++... -------------------- yes
14
14
 
15
- "/usr/bin/gcc-4.0 -o conftest -I. -I/opt/local/lib/ruby/1.8/i686-darwin9.6.0 -I. -I/opt/local/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -I/opt/local/include -O2 -fno-common -pipe -fno-common conftest.c -L. -L/opt/local/lib -L. -L/opt/local/lib -lc -lruby-static -lstdc++ -lc -lpthread -ldl -lobjc "
15
+ "gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC -Wall conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lc -lruby-static -lstdc++ -lc -lpthread -ldl -lcrypt -lm -lc"
16
16
  checked program was:
17
17
  /* begin */
18
18
  1: /*top*/
@@ -28,13 +28,27 @@ VALUE Tagger_execute( VALUE self, VALUE text )
28
28
  {
29
29
  NWordTagger *tagger;
30
30
  Data_Get_Struct( self, NWordTagger, tagger );
31
- std::vector<std::string> tags = tagger->execute( RSTRING_PTR(text) );
31
+ std::vector<std::string> tags;
32
+ tagger->execute( tags, RSTRING_PTR(text) );
32
33
  VALUE results = rb_ary_new2(tags.size());
33
- for( size_t i = 0; i < tags.size(); ++i ){
34
+ for( size_t i = 0; i < tags.size(); ++i ) {
34
35
  rb_ary_push( results, rb_str_new( tags[i].c_str(), tags[i].length() ) );
35
36
  }
36
37
  return results;
37
38
  }
39
+ VALUE Tagger_execute_freq( VALUE self, VALUE text )
40
+ {
41
+ NWordTagger *tagger;
42
+ Data_Get_Struct( self, NWordTagger, tagger );
43
+ int max_count = 0;
44
+ std::map<std::string,int> tags;
45
+ tagger->execute_with_frequency( RSTRING_PTR(text), tags, max_count );
46
+ VALUE results = rb_hash_new();
47
+ for( std::map<std::string,int>::const_iterator it = tags.begin(); it != tags.end(); ++it ) {
48
+ rb_hash_aset( results, rb_str_new(it->first.c_str(), it->first.length()), rb_int_new(it->second) );
49
+ }
50
+ return results;
51
+ }
38
52
  VALUE Tagger_set_words( VALUE self, VALUE words )
39
53
  {
40
54
  NWordTagger *tagger;
@@ -79,5 +93,6 @@ extern "C" void Init_word_tagger()
79
93
 
80
94
  rb_define_method( rb_NWordTagger, "load_tags", (VALUE (*)(...))Tagger_load_tags, 1 );
81
95
  rb_define_method( rb_NWordTagger, "execute", (VALUE (*)(...))Tagger_execute, 1 );
96
+ rb_define_method( rb_NWordTagger, "freq", (VALUE (*)(...))Tagger_execute_freq, 1 );
82
97
  rb_define_method( rb_NWordTagger, "set_words", (VALUE (*)(...))Tagger_set_words, 1 );
83
98
  }
@@ -37,7 +37,7 @@ static std::vector<std::string> word_split(const std::string& s)
37
37
 
38
38
  static void word_downcase( std::string &word )
39
39
  {
40
- for( int j = 0; j < word.size(); ++j ) {
40
+ for( std::string::size_type j = 0; j < word.size(); ++j ) {
41
41
  word[j] = tolower( word[j] );
42
42
  }
43
43
  }
@@ -82,11 +82,9 @@ std::string NWordTagger::stemWord( const std::string &word )const
82
82
  return stemmed;
83
83
  }
84
84
 
85
- std::vector<std::string> NWordTagger::execute( const char *text, short max )const
85
+ int NWordTagger::execute_with_frequency( const char *text, std::map<std::string,int> &matched_tags, int &max_count )const
86
86
  {
87
- int max_count = 0;
88
87
  std::vector<std::string> words = word_split( text );
89
- std::map<std::string, int> matched_tags; // stores tags and frequency
90
88
  std::string match_word;
91
89
  std::map<std::string,std::string>::const_iterator matched;
92
90
 
@@ -130,16 +128,23 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
130
128
  }
131
129
  }
132
130
  }
133
-
134
- std::vector< std::string > reduced_tags;
131
+ return matched_tags.size();
132
+ }
133
+
134
+ int NWordTagger::execute( std::vector<std::string> &reduced_tags, const char *text, unsigned short max )const
135
+ {
136
+ int max_count = 0;
137
+ std::map<std::string, int> matched_tags; // stores tags and frequency
135
138
 
139
+ execute_with_frequency(text, matched_tags, max_count);
140
+
136
141
  // now we have a list of tags that match within the document text, check if we need to reduce the tags
137
142
  if( matched_tags.size() < max ) {
138
143
  // prepare the return vector
139
144
  for( std::map<std::string, int>::iterator mloc = matched_tags.begin(); mloc != matched_tags.end(); ++mloc ){
140
145
  reduced_tags.push_back( mloc->first );
141
146
  }
142
- return reduced_tags;
147
+ return reduced_tags.size();
143
148
  }
144
149
 
145
150
  // now that we have all the matched tags reduce to max using the tag frequency as a reduction measure
@@ -154,13 +159,12 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
154
159
  // sort the tags in frequency order
155
160
  std::sort( sorted_tags.begin(), sorted_tags.end(), WordComparitor() );
156
161
 
157
-
158
162
  std::vector< std::pair<std::string, int> >::iterator mloc;
159
163
  do {
160
164
  for(mloc = sorted_tags.begin(); mloc != sorted_tags.end(); ++mloc ) {
161
165
  std::pair< std::string, int > word_freq = *mloc;
162
- printf( "word: %s, frequency: %d\n", word_freq.first.c_str(), word_freq.second );
163
- printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
166
+ // printf( "word: %s, frequency: %d\n", word_freq.first.c_str(), word_freq.second );
167
+ // printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
164
168
  if( word_freq.second < max_count ) {
165
169
  sorted_tags.erase( mloc );
166
170
  break;
@@ -172,5 +176,5 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
172
176
  reduced_tags.push_back( sorted_tags[i].first );
173
177
  }
174
178
 
175
- return reduced_tags;
179
+ return matched_tags.size();
176
180
  }
@@ -14,7 +14,14 @@ struct NWordTagger {
14
14
  short getNWords()const{ return nwords; }
15
15
  void setNWords( short words ){ nwords = words; }
16
16
 
17
- std::vector<std::string> execute( const char *text, short max = 10 )const;
17
+ // return the number of matched tags
18
+ // fill results with matching tags in the text body
19
+ // keep the number of tags returned within the threshold of max. reducing tags by least frequent
20
+ int execute( std::vector<std::string> &reduced_tags, const char *text, unsigned short max = 10 )const;
21
+
22
+ // return the number of matched tags
23
+ // result is updated with a mapping of matched tags with their individual term frequency count
24
+ int execute_with_frequency( const char *text, std::map<std::string,int> &matched_tags, int &max_count )const;
18
25
  private:
19
26
  short nwords;
20
27
  struct stemmer *stemmer;
@@ -2,7 +2,7 @@ module RbTagger #:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 3
5
- TINY = 1
5
+ TINY = 2
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
@@ -117,14 +117,14 @@ TMZ.com: Britney celebrated getting overnights with her kids by going on a wild
117
117
  end
118
118
 
119
119
  def test_multiple_docs
120
- timer = Time.now
120
+ #timer = Time.now
121
121
  count = 0
122
122
  Dir["#{File.dirname(__FILE__)}/docs/doc*"].each do|doc|
123
123
  tagger.tag( File.read( doc ) )
124
124
  count += 1
125
125
  end
126
- duration = Time.now - timer
127
- puts "time: #{duration} sec #{count.to_f/duration} docs/sec"
126
+ #duration = Time.now - timer
127
+ #puts "time: #{duration} sec #{count.to_f/duration} docs/sec"
128
128
  end
129
129
 
130
130
  def test_suggest
@@ -136,7 +136,7 @@ TMZ.com: Britney celebrated getting overnights with her kids by going on a wild
136
136
  assert results.include?(["Jamie Spears", "NNP", 12])
137
137
  # puts results.inspect
138
138
  results = tagger.suggest( SAMPLE_DOC3, 5 )
139
- puts results.inspect
139
+ #puts results.inspect
140
140
  end
141
141
 
142
142
  def test_adjectives
@@ -9,11 +9,11 @@ class TestWordTagger < Test::Unit::TestCase
9
9
  end
10
10
 
11
11
  def test_basic
12
- timer = Time.now
12
+ #timer = Time.now
13
13
  text = "This is a sa'mple doc[]ument lets see how cancer ngrams 4 works out for this interesting text!"
14
14
  tags = $wtagger.execute( text )
15
15
  assert_equal ['cancer','work'], tags
16
- puts "Duration: #{Time.now - timer} sec"
16
+ #puts "Duration: #{Time.now - timer} sec"
17
17
  end
18
18
 
19
19
  def test_sample_bug
@@ -25,11 +25,11 @@ class TestWordTagger < Test::Unit::TestCase
25
25
  end
26
26
 
27
27
  def test_ngram_size3
28
- timer = Time.now
28
+ #timer = Time.now
29
29
  text = "This body of text contains something like ventricular septal defect"
30
30
  tags = $wtagger.execute( text )
31
31
  assert_equal ['ventricular septal defect'], tags
32
- puts "Duration: #{Time.now - timer} sec"
32
+ #puts "Duration: #{Time.now - timer} sec"
33
33
  end
34
34
 
35
35
  def test_cat_and_the_hat
@@ -38,4 +38,10 @@ class TestWordTagger < Test::Unit::TestCase
38
38
  assert_equal( ["Cat", "hat"], tags )
39
39
  end
40
40
 
41
+ def test_freq_counts
42
+ tagger = Word::Tagger.new( ['Cat','hat'], :words => 4 )
43
+ tags = tagger.freq( 'the cAt and the hat the cAt and the hat the cAt and the hat the cAt and the hat' )
44
+ assert_equal( {"Cat"=>4, "hat"=>4}, tags )
45
+ end
46
+
41
47
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbtagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Todd A. Fisher
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-05-21 00:00:00 -04:00
12
+ date: 2009-07-07 00:00:00 -04:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -20,7 +20,7 @@ dependencies:
20
20
  requirements:
21
21
  - - ">="
22
22
  - !ruby/object:Gem::Version
23
- version: 1.8.0
23
+ version: 2.3.2
24
24
  version:
25
25
  description: A Simple Ruby Rule-Based Part of Speech Tagger
26
26
  email:
@@ -28,8 +28,8 @@ email:
28
28
  executables: []
29
29
 
30
30
  extensions:
31
- - ext/rule_tagger/extconf.rb
32
31
  - ext/word_tagger/extconf.rb
32
+ - ext/rule_tagger/extconf.rb
33
33
  extra_rdoc_files:
34
34
  - History.txt
35
35
  - License.txt
@@ -37,17 +37,6 @@ extra_rdoc_files:
37
37
  - PostInstall.txt
38
38
  - README.txt
39
39
  - ext/word_tagger/test/doc.txt
40
- - test/docs/doc0.txt
41
- - test/docs/doc1.txt
42
- - test/docs/doc2.txt
43
- - test/docs/doc3.txt
44
- - test/docs/doc4.txt
45
- - test/docs/doc5.txt
46
- - test/docs/doc6.txt
47
- - test/docs/doc7.txt
48
- - test/docs/doc8.txt
49
- - test/docs/doc9.txt
50
- - test/fixtures/tags.txt
51
40
  - website/index.txt
52
41
  files:
53
42
  - COPYING
@@ -134,18 +123,16 @@ files:
134
123
  - website/template.html.erb
135
124
  has_rdoc: true
136
125
  homepage: http://rbtagger.rubyforge.org
137
- licenses: []
138
-
139
126
  post_install_message: |
140
127
  For more information on rbtagger, see http://rbtagger.rubyforge.org
141
128
 
142
129
  rdoc_options:
143
130
  - --main
144
- - README
131
+ - README.txt
145
132
  require_paths:
146
133
  - lib
147
- - ext/rule_tagger
148
134
  - ext/word_tagger
135
+ - ext/rule_tagger
149
136
  required_ruby_version: !ruby/object:Gem::Requirement
150
137
  requirements:
151
138
  - - ">="
@@ -161,11 +148,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
161
148
  requirements: []
162
149
 
163
150
  rubyforge_project: rbtagger
164
- rubygems_version: 1.3.2
151
+ rubygems_version: 1.3.1
165
152
  signing_key:
166
- specification_version: 3
153
+ specification_version: 2
167
154
  summary: A Simple Ruby Rule-Based Part of Speech Tagger
168
155
  test_files:
169
- - test/test_helper.rb
170
156
  - test/test_rule_tagger.rb
157
+ - test/test_helper.rb
171
158
  - test/test_word_tagger.rb