rbtagger 0.3.1 → 0.3.2

This diff shows the changes between publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
@@ -9,6 +9,6 @@ if !have_func('snprintf', 'stdio.h')
9
9
  raise "You must have snprintf available to compile this library"
10
10
  end
11
11
 
12
- CFLAGS='-Wall -g'
12
+ CFLAGS='-Wall'
13
13
 
14
14
  create_makefile('rule_tagger')
@@ -1,6 +1,6 @@
1
1
  have_header: checking for stdlib.h... -------------------- yes
2
2
 
3
- "/usr/bin/cpp-4.0 -I. -I/opt/local/lib/ruby/1.8/i686-darwin9.6.0 -I. -I/opt/local/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -I/opt/local/include -O2 -fno-common -pipe -fno-common conftest.c -o conftest.i"
3
+ "gcc -E -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC conftest.c -o conftest.i"
4
4
  checked program was:
5
5
  /* begin */
6
6
  1: #include <stdlib.h>
@@ -10,7 +10,7 @@ checked program was:
10
10
 
11
11
  have_header: checking for string.h... -------------------- yes
12
12
 
13
- "/usr/bin/cpp-4.0 -I. -I/opt/local/lib/ruby/1.8/i686-darwin9.6.0 -I. -I/opt/local/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -I/opt/local/include -O2 -fno-common -pipe -fno-common conftest.c -o conftest.i"
13
+ "gcc -E -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC conftest.c -o conftest.i"
14
14
  checked program was:
15
15
  /* begin */
16
16
  1: #include <string.h>
@@ -20,7 +20,7 @@ checked program was:
20
20
 
21
21
  have_library: checking for main() in -lc... -------------------- yes
22
22
 
23
- "/usr/bin/gcc-4.0 -o conftest -I. -I/opt/local/lib/ruby/1.8/i686-darwin9.6.0 -I. -I/opt/local/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -I/opt/local/include -O2 -fno-common -pipe -fno-common conftest.c -L. -L/opt/local/lib -L. -L/opt/local/lib -lruby-static -lc -lpthread -ldl -lobjc "
23
+ "gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lruby-static -lc -lpthread -ldl -lcrypt -lm -lc"
24
24
  checked program was:
25
25
  /* begin */
26
26
  1: /*top*/
@@ -32,7 +32,7 @@ checked program was:
32
32
 
33
33
  have_func: checking for snprintf() in stdio.h... -------------------- yes
34
34
 
35
- "/usr/bin/gcc-4.0 -o conftest -I. -I/opt/local/lib/ruby/1.8/i686-darwin9.6.0 -I. -I/opt/local/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -I/opt/local/include -O2 -fno-common -pipe -fno-common conftest.c -L. -L/opt/local/lib -L. -L/opt/local/lib -lc -lruby-static -lc -lpthread -ldl -lobjc "
35
+ "gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lc -lruby-static -lc -lpthread -ldl -lcrypt -lm -lc"
36
36
  checked program was:
37
37
  /* begin */
38
38
  1: #include <stdio.h>
@@ -1,4 +1,5 @@
1
1
  require 'mkmf'
2
+ $CFLAGS << ' -Wall'
2
3
 
3
4
  dir_config("word_tagger")
4
5
  have_library("c", "main")
@@ -1,6 +1,6 @@
1
1
  have_library: checking for main() in -lc... -------------------- yes
2
2
 
3
- "/usr/bin/gcc-4.0 -o conftest -I. -I/opt/local/lib/ruby/1.8/i686-darwin9.6.0 -I. -I/opt/local/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -I/opt/local/include -O2 -fno-common -pipe -fno-common conftest.c -L. -L/opt/local/lib -L. -L/opt/local/lib -lruby-static -lc -lpthread -ldl -lobjc "
3
+ "gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC -Wall conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lruby-static -lc -lpthread -ldl -lcrypt -lm -lc"
4
4
  checked program was:
5
5
  /* begin */
6
6
  1: /*top*/
@@ -12,7 +12,7 @@ checked program was:
12
12
 
13
13
  have_library: checking for main() in -lstdc++... -------------------- yes
14
14
 
15
- "/usr/bin/gcc-4.0 -o conftest -I. -I/opt/local/lib/ruby/1.8/i686-darwin9.6.0 -I. -I/opt/local/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -I/opt/local/include -O2 -fno-common -pipe -fno-common conftest.c -L. -L/opt/local/lib -L. -L/opt/local/lib -lc -lruby-static -lstdc++ -lc -lpthread -ldl -lobjc "
15
+ "gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC -Wall conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lc -lruby-static -lstdc++ -lc -lpthread -ldl -lcrypt -lm -lc"
16
16
  checked program was:
17
17
  /* begin */
18
18
  1: /*top*/
@@ -28,13 +28,27 @@ VALUE Tagger_execute( VALUE self, VALUE text )
28
28
  {
29
29
  NWordTagger *tagger;
30
30
  Data_Get_Struct( self, NWordTagger, tagger );
31
- std::vector<std::string> tags = tagger->execute( RSTRING_PTR(text) );
31
+ std::vector<std::string> tags;
32
+ tagger->execute( tags, RSTRING_PTR(text) );
32
33
  VALUE results = rb_ary_new2(tags.size());
33
- for( size_t i = 0; i < tags.size(); ++i ){
34
+ for( size_t i = 0; i < tags.size(); ++i ) {
34
35
  rb_ary_push( results, rb_str_new( tags[i].c_str(), tags[i].length() ) );
35
36
  }
36
37
  return results;
37
38
  }
39
+ VALUE Tagger_execute_freq( VALUE self, VALUE text )
40
+ {
41
+ NWordTagger *tagger;
42
+ Data_Get_Struct( self, NWordTagger, tagger );
43
+ int max_count = 0;
44
+ std::map<std::string,int> tags;
45
+ tagger->execute_with_frequency( RSTRING_PTR(text), tags, max_count );
46
+ VALUE results = rb_hash_new();
47
+ for( std::map<std::string,int>::const_iterator it = tags.begin(); it != tags.end(); ++it ) {
48
+ rb_hash_aset( results, rb_str_new(it->first.c_str(), it->first.length()), rb_int_new(it->second) );
49
+ }
50
+ return results;
51
+ }
38
52
  VALUE Tagger_set_words( VALUE self, VALUE words )
39
53
  {
40
54
  NWordTagger *tagger;
@@ -79,5 +93,6 @@ extern "C" void Init_word_tagger()
79
93
 
80
94
  rb_define_method( rb_NWordTagger, "load_tags", (VALUE (*)(...))Tagger_load_tags, 1 );
81
95
  rb_define_method( rb_NWordTagger, "execute", (VALUE (*)(...))Tagger_execute, 1 );
96
+ rb_define_method( rb_NWordTagger, "freq", (VALUE (*)(...))Tagger_execute_freq, 1 );
82
97
  rb_define_method( rb_NWordTagger, "set_words", (VALUE (*)(...))Tagger_set_words, 1 );
83
98
  }
@@ -37,7 +37,7 @@ static std::vector<std::string> word_split(const std::string& s)
37
37
 
38
38
  static void word_downcase( std::string &word )
39
39
  {
40
- for( int j = 0; j < word.size(); ++j ) {
40
+ for( std::string::size_type j = 0; j < word.size(); ++j ) {
41
41
  word[j] = tolower( word[j] );
42
42
  }
43
43
  }
@@ -82,11 +82,9 @@ std::string NWordTagger::stemWord( const std::string &word )const
82
82
  return stemmed;
83
83
  }
84
84
 
85
- std::vector<std::string> NWordTagger::execute( const char *text, short max )const
85
+ int NWordTagger::execute_with_frequency( const char *text, std::map<std::string,int> &matched_tags, int &max_count )const
86
86
  {
87
- int max_count = 0;
88
87
  std::vector<std::string> words = word_split( text );
89
- std::map<std::string, int> matched_tags; // stores tags and frequency
90
88
  std::string match_word;
91
89
  std::map<std::string,std::string>::const_iterator matched;
92
90
 
@@ -130,16 +128,23 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
130
128
  }
131
129
  }
132
130
  }
133
-
134
- std::vector< std::string > reduced_tags;
131
+ return matched_tags.size();
132
+ }
133
+
134
+ int NWordTagger::execute( std::vector<std::string> &reduced_tags, const char *text, unsigned short max )const
135
+ {
136
+ int max_count = 0;
137
+ std::map<std::string, int> matched_tags; // stores tags and frequency
135
138
 
139
+ execute_with_frequency(text, matched_tags, max_count);
140
+
136
141
  // now we have a list of tags that match within the document text, check if we need to reduce the tags
137
142
  if( matched_tags.size() < max ) {
138
143
  // prepare the return vector
139
144
  for( std::map<std::string, int>::iterator mloc = matched_tags.begin(); mloc != matched_tags.end(); ++mloc ){
140
145
  reduced_tags.push_back( mloc->first );
141
146
  }
142
- return reduced_tags;
147
+ return reduced_tags.size();
143
148
  }
144
149
 
145
150
  // now that we have all the matched tags reduce to max using the tag frequency as a reduction measure
@@ -154,13 +159,12 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
154
159
  // sort the tags in frequency order
155
160
  std::sort( sorted_tags.begin(), sorted_tags.end(), WordComparitor() );
156
161
 
157
-
158
162
  std::vector< std::pair<std::string, int> >::iterator mloc;
159
163
  do {
160
164
  for(mloc = sorted_tags.begin(); mloc != sorted_tags.end(); ++mloc ) {
161
165
  std::pair< std::string, int > word_freq = *mloc;
162
- printf( "word: %s, frequency: %d\n", word_freq.first.c_str(), word_freq.second );
163
- printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
166
+ // printf( "word: %s, frequency: %d\n", word_freq.first.c_str(), word_freq.second );
167
+ // printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
164
168
  if( word_freq.second < max_count ) {
165
169
  sorted_tags.erase( mloc );
166
170
  break;
@@ -172,5 +176,5 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
172
176
  reduced_tags.push_back( sorted_tags[i].first );
173
177
  }
174
178
 
175
- return reduced_tags;
179
+ return matched_tags.size();
176
180
  }
@@ -14,7 +14,14 @@ struct NWordTagger {
14
14
  short getNWords()const{ return nwords; }
15
15
  void setNWords( short words ){ nwords = words; }
16
16
 
17
- std::vector<std::string> execute( const char *text, short max = 10 )const;
17
+ // return the number of matched tags
18
+ // fill results with matching tags in the text body
19
+ // keep the number of tags returned within the threshold of max. reducing tags by least frequent
20
+ int execute( std::vector<std::string> &reduced_tags, const char *text, unsigned short max = 10 )const;
21
+
22
+ // return the number of matched tags
23
+ // result is updated with a mapping of matched tags with their individual term frequency count
24
+ int execute_with_frequency( const char *text, std::map<std::string,int> &matched_tags, int &max_count )const;
18
25
  private:
19
26
  short nwords;
20
27
  struct stemmer *stemmer;
@@ -2,7 +2,7 @@ module RbTagger #:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 3
5
- TINY = 1
5
+ TINY = 2
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
@@ -117,14 +117,14 @@ TMZ.com: Britney celebrated getting overnights with her kids by going on a wild
117
117
  end
118
118
 
119
119
  def test_multiple_docs
120
- timer = Time.now
120
+ #timer = Time.now
121
121
  count = 0
122
122
  Dir["#{File.dirname(__FILE__)}/docs/doc*"].each do|doc|
123
123
  tagger.tag( File.read( doc ) )
124
124
  count += 1
125
125
  end
126
- duration = Time.now - timer
127
- puts "time: #{duration} sec #{count.to_f/duration} docs/sec"
126
+ #duration = Time.now - timer
127
+ #puts "time: #{duration} sec #{count.to_f/duration} docs/sec"
128
128
  end
129
129
 
130
130
  def test_suggest
@@ -136,7 +136,7 @@ TMZ.com: Britney celebrated getting overnights with her kids by going on a wild
136
136
  assert results.include?(["Jamie Spears", "NNP", 12])
137
137
  # puts results.inspect
138
138
  results = tagger.suggest( SAMPLE_DOC3, 5 )
139
- puts results.inspect
139
+ #puts results.inspect
140
140
  end
141
141
 
142
142
  def test_adjectives
@@ -9,11 +9,11 @@ class TestWordTagger < Test::Unit::TestCase
9
9
  end
10
10
 
11
11
  def test_basic
12
- timer = Time.now
12
+ #timer = Time.now
13
13
  text = "This is a sa'mple doc[]ument lets see how cancer ngrams 4 works out for this interesting text!"
14
14
  tags = $wtagger.execute( text )
15
15
  assert_equal ['cancer','work'], tags
16
- puts "Duration: #{Time.now - timer} sec"
16
+ #puts "Duration: #{Time.now - timer} sec"
17
17
  end
18
18
 
19
19
  def test_sample_bug
@@ -25,11 +25,11 @@ class TestWordTagger < Test::Unit::TestCase
25
25
  end
26
26
 
27
27
  def test_ngram_size3
28
- timer = Time.now
28
+ #timer = Time.now
29
29
  text = "This body of text contains something like ventricular septal defect"
30
30
  tags = $wtagger.execute( text )
31
31
  assert_equal ['ventricular septal defect'], tags
32
- puts "Duration: #{Time.now - timer} sec"
32
+ #puts "Duration: #{Time.now - timer} sec"
33
33
  end
34
34
 
35
35
  def test_cat_and_the_hat
@@ -38,4 +38,10 @@ class TestWordTagger < Test::Unit::TestCase
38
38
  assert_equal( ["Cat", "hat"], tags )
39
39
  end
40
40
 
41
+ def test_freq_counts
42
+ tagger = Word::Tagger.new( ['Cat','hat'], :words => 4 )
43
+ tags = tagger.freq( 'the cAt and the hat the cAt and the hat the cAt and the hat the cAt and the hat' )
44
+ assert_equal( {"Cat"=>4, "hat"=>4}, tags )
45
+ end
46
+
41
47
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbtagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Todd A. Fisher
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-05-21 00:00:00 -04:00
12
+ date: 2009-07-07 00:00:00 -04:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -20,7 +20,7 @@ dependencies:
20
20
  requirements:
21
21
  - - ">="
22
22
  - !ruby/object:Gem::Version
23
- version: 1.8.0
23
+ version: 2.3.2
24
24
  version:
25
25
  description: A Simple Ruby Rule-Based Part of Speech Tagger
26
26
  email:
@@ -28,8 +28,8 @@ email:
28
28
  executables: []
29
29
 
30
30
  extensions:
31
- - ext/rule_tagger/extconf.rb
32
31
  - ext/word_tagger/extconf.rb
32
+ - ext/rule_tagger/extconf.rb
33
33
  extra_rdoc_files:
34
34
  - History.txt
35
35
  - License.txt
@@ -37,17 +37,6 @@ extra_rdoc_files:
37
37
  - PostInstall.txt
38
38
  - README.txt
39
39
  - ext/word_tagger/test/doc.txt
40
- - test/docs/doc0.txt
41
- - test/docs/doc1.txt
42
- - test/docs/doc2.txt
43
- - test/docs/doc3.txt
44
- - test/docs/doc4.txt
45
- - test/docs/doc5.txt
46
- - test/docs/doc6.txt
47
- - test/docs/doc7.txt
48
- - test/docs/doc8.txt
49
- - test/docs/doc9.txt
50
- - test/fixtures/tags.txt
51
40
  - website/index.txt
52
41
  files:
53
42
  - COPYING
@@ -134,18 +123,16 @@ files:
134
123
  - website/template.html.erb
135
124
  has_rdoc: true
136
125
  homepage: http://rbtagger.rubyforge.org
137
- licenses: []
138
-
139
126
  post_install_message: |
140
127
  For more information on rbtagger, see http://rbtagger.rubyforge.org
141
128
 
142
129
  rdoc_options:
143
130
  - --main
144
- - README
131
+ - README.txt
145
132
  require_paths:
146
133
  - lib
147
- - ext/rule_tagger
148
134
  - ext/word_tagger
135
+ - ext/rule_tagger
149
136
  required_ruby_version: !ruby/object:Gem::Requirement
150
137
  requirements:
151
138
  - - ">="
@@ -161,11 +148,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
161
148
  requirements: []
162
149
 
163
150
  rubyforge_project: rbtagger
164
- rubygems_version: 1.3.2
151
+ rubygems_version: 1.3.1
165
152
  signing_key:
166
- specification_version: 3
153
+ specification_version: 2
167
154
  summary: A Simple Ruby Rule-Based Part of Speech Tagger
168
155
  test_files:
169
- - test/test_helper.rb
170
156
  - test/test_rule_tagger.rb
157
+ - test/test_helper.rb
171
158
  - test/test_word_tagger.rb