RubyGems - rbtagger - Versions diffs - 0.3.1 → 0.3.2 - Mend

rbtagger 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

data/ext/rule_tagger/extconf.rb CHANGED

@@ -9,6 +9,6 @@ if !have_func('snprintf', 'stdio.h')
   raise "You must have snprintf available to compile this library"
 end
-CFLAGS='-Wall -g'
+CFLAGS='-Wall'
 create_makefile('rule_tagger')

data/ext/rule_tagger/mkmf.log CHANGED

@@ -1,6 +1,6 @@
 have_header: checking for stdlib.h... -------------------- yes
-"/usr/bin/cpp-4.0 -I. -I/opt/local/lib/ruby/1.8/i686-darwin9.6.0 -I. -I/opt/local/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE  -I/opt/local/include -O2  -fno-common -pipe -fno-common    conftest.c -o conftest.i"
+"gcc -E -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I.   -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing  -fPIC  conftest.c -o conftest.i"
 checked program was:
 /* begin */
 1: #include <stdlib.h>
@@ -10,7 +10,7 @@ checked program was:
 have_header: checking for string.h... -------------------- yes
-"/usr/bin/cpp-4.0 -I. -I/opt/local/lib/ruby/1.8/i686-darwin9.6.0 -I. -I/opt/local/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE  -I/opt/local/include -O2  -fno-common -pipe -fno-common    conftest.c -o conftest.i"
+"gcc -E -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I.   -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing  -fPIC  conftest.c -o conftest.i"
 checked program was:
 /* begin */
 1: #include <string.h>
@@ -20,7 +20,7 @@ checked program was:
 have_library: checking for main() in -lc... -------------------- yes
-"/usr/bin/gcc-4.0 -o conftest -I. -I/opt/local/lib/ruby/1.8/i686-darwin9.6.0 -I. -I/opt/local/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE  -I/opt/local/include -O2  -fno-common -pipe -fno-common   conftest.c  -L. -L/opt/local/lib -L. -L/opt/local/lib     -lruby-static -lc  -lpthread -ldl -lobjc  "
+"gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I.   -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing  -fPIC conftest.c  -L. -L/usr/lib64 -L.  -rdynamic -Wl,-export-dynamic     -lruby-static -lc  -lpthread -ldl -lcrypt -lm   -lc"
 checked program was:
 /* begin */
 1: /*top*/
@@ -32,7 +32,7 @@ checked program was:
 have_func: checking for snprintf() in stdio.h... -------------------- yes
-"/usr/bin/gcc-4.0 -o conftest -I. -I/opt/local/lib/ruby/1.8/i686-darwin9.6.0 -I. -I/opt/local/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE  -I/opt/local/include -O2  -fno-common -pipe -fno-common   conftest.c  -L. -L/opt/local/lib -L. -L/opt/local/lib    -lc  -lruby-static -lc  -lpthread -ldl -lobjc  "
+"gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I.   -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing  -fPIC conftest.c  -L. -L/usr/lib64 -L.  -rdynamic -Wl,-export-dynamic    -lc  -lruby-static -lc  -lpthread -ldl -lcrypt -lm   -lc"
 checked program was:
 /* begin */
 1: #include <stdio.h>

data/ext/word_tagger/extconf.rb CHANGED

@@ -1,4 +1,5 @@
 require 'mkmf'
+$CFLAGS << ' -Wall'
 dir_config("word_tagger")
 have_library("c", "main")

data/ext/word_tagger/mkmf.log CHANGED

@@ -1,6 +1,6 @@
 have_library: checking for main() in -lc... -------------------- yes
-"/usr/bin/gcc-4.0 -o conftest -I. -I/opt/local/lib/ruby/1.8/i686-darwin9.6.0 -I. -I/opt/local/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE  -I/opt/local/include -O2  -fno-common -pipe -fno-common   conftest.c  -L. -L/opt/local/lib -L. -L/opt/local/lib     -lruby-static -lc  -lpthread -ldl -lobjc  "
+"gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I.   -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing  -fPIC -Wall conftest.c  -L. -L/usr/lib64 -L.  -rdynamic -Wl,-export-dynamic     -lruby-static -lc  -lpthread -ldl -lcrypt -lm   -lc"
 checked program was:
 /* begin */
 1: /*top*/
@@ -12,7 +12,7 @@ checked program was:
 have_library: checking for main() in -lstdc++... -------------------- yes
-"/usr/bin/gcc-4.0 -o conftest -I. -I/opt/local/lib/ruby/1.8/i686-darwin9.6.0 -I. -I/opt/local/include -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE  -I/opt/local/include -O2  -fno-common -pipe -fno-common   conftest.c  -L. -L/opt/local/lib -L. -L/opt/local/lib    -lc  -lruby-static -lstdc++ -lc  -lpthread -ldl -lobjc  "
+"gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I.   -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing  -fPIC -Wall conftest.c  -L. -L/usr/lib64 -L.  -rdynamic -Wl,-export-dynamic    -lc  -lruby-static -lstdc++ -lc  -lpthread -ldl -lcrypt -lm   -lc"
 checked program was:
 /* begin */
 1: /*top*/

data/ext/word_tagger/rtagger.cc CHANGED

@@ -28,13 +28,27 @@ VALUE Tagger_execute( VALUE self, VALUE text )
 {
   NWordTagger *tagger;
   Data_Get_Struct( self, NWordTagger, tagger );
-  std::vector<std::string> tags = tagger->execute( RSTRING_PTR(text) );
+  std::vector<std::string> tags;
+  tagger->execute( tags, RSTRING_PTR(text) );
   VALUE results = rb_ary_new2(tags.size());
-  for( size_t i = 0; i < tags.size(); ++i ){
+  for( size_t i = 0; i < tags.size(); ++i ) {
     rb_ary_push( results, rb_str_new( tags[i].c_str(), tags[i].length() ) );
   }
   return results;
 }
+VALUE Tagger_execute_freq( VALUE self, VALUE text )
+{
+  NWordTagger *tagger;
+  Data_Get_Struct( self, NWordTagger, tagger );
+  int max_count = 0;
+  std::map<std::string,int> tags;
+  tagger->execute_with_frequency( RSTRING_PTR(text), tags, max_count );
+  VALUE results = rb_hash_new();
+  for( std::map<std::string,int>::const_iterator it = tags.begin(); it != tags.end(); ++it ) {
+    rb_hash_aset( results, rb_str_new(it->first.c_str(), it->first.length()), rb_int_new(it->second) );
+  }
+  return results;
+}
 VALUE Tagger_set_words( VALUE self, VALUE words )
 {
   NWordTagger *tagger;
@@ -79,5 +93,6 @@ extern "C" void Init_word_tagger()
   rb_define_method( rb_NWordTagger, "load_tags", (VALUE (*)(...))Tagger_load_tags, 1 );
   rb_define_method( rb_NWordTagger, "execute", (VALUE (*)(...))Tagger_execute, 1 );
+  rb_define_method( rb_NWordTagger, "freq", (VALUE (*)(...))Tagger_execute_freq, 1 );
   rb_define_method( rb_NWordTagger, "set_words", (VALUE (*)(...))Tagger_set_words, 1 );
 }

data/ext/word_tagger/tagger.cc CHANGED

@@ -37,7 +37,7 @@ static std::vector<std::string> word_split(const std::string& s)
 static void word_downcase( std::string &word )
 {
-  for( int j = 0; j < word.size(); ++j ) {
+  for( std::string::size_type j = 0; j < word.size(); ++j ) {
     word[j] = tolower( word[j] );
   }
 }
@@ -82,11 +82,9 @@ std::string NWordTagger::stemWord( const std::string &word )const
   return stemmed;
 }
-std::vector<std::string> NWordTagger::execute( const char *text, short max )const
+int NWordTagger::execute_with_frequency( const char *text, std::map<std::string,int> &matched_tags, int &max_count )const
 {
-  int max_count = 0;
   std::vector<std::string> words = word_split( text );
-  std::map<std::string, int> matched_tags; // stores tags and frequency
   std::string match_word;
   std::map<std::string,std::string>::const_iterator matched;
@@ -130,16 +128,23 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
       }
     }
   }
-  std::vector< std::string > reduced_tags;
+  return matched_tags.size();
+}
+int NWordTagger::execute( std::vector<std::string> &reduced_tags, const char *text, unsigned short max )const
+{
+  int max_count = 0;
+  std::map<std::string, int> matched_tags; // stores tags and frequency
+  execute_with_frequency(text, matched_tags, max_count);
   // now we have a list of tags that match within the document text, check if we need to reduce the tags
   if( matched_tags.size() < max ) {
     // prepare the return vector
     for( std::map<std::string, int>::iterator mloc = matched_tags.begin(); mloc != matched_tags.end(); ++mloc ){
       reduced_tags.push_back( mloc->first );
     }
-    return reduced_tags;
+    return reduced_tags.size();
   }
   // now that we have all the matched tags reduce to max using the tag frequency as a reduction measure
@@ -154,13 +159,12 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
   // sort the tags in frequency order
   std::sort( sorted_tags.begin(), sorted_tags.end(), WordComparitor() );
   std::vector< std::pair<std::string, int> >::iterator mloc;
   do {
     for(mloc = sorted_tags.begin(); mloc != sorted_tags.end(); ++mloc ) {
       std::pair< std::string, int > word_freq = *mloc;
-      printf( "word: %s, frequency: %d\n", word_freq.first.c_str(), word_freq.second );
-      printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
+//      printf( "word: %s, frequency: %d\n", word_freq.first.c_str(), word_freq.second );
+//      printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
       if( word_freq.second < max_count ) {
         sorted_tags.erase( mloc );
         break;
@@ -172,5 +176,5 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
     reduced_tags.push_back( sorted_tags[i].first );
   }
-  return reduced_tags;
+  return matched_tags.size();
 }

data/ext/word_tagger/tagger.h CHANGED

@@ -14,7 +14,14 @@ struct NWordTagger {
   short getNWords()const{ return nwords; }
   void setNWords( short words ){ nwords = words; }
-  std::vector<std::string> execute( const char *text, short max = 10 )const;
+  // return the number of matched tags
+  // fill results with matching tags in the text body
+  // keep the number of tags returned within the threshold of max. reducing tags by least frequent
+  int execute( std::vector<std::string> &reduced_tags, const char *text, unsigned short max = 10 )const;
+  // return the number of matched tags
+  // result is updated with a mapping of matched tags with their individual term frequency count
+  int execute_with_frequency( const char *text, std::map<std::string,int> &matched_tags, int &max_count )const;
 private:
   short nwords;
   struct stemmer *stemmer;

data/lib/rbtagger/version.rb CHANGED

@@ -2,7 +2,7 @@ module RbTagger #:nodoc:
   module VERSION #:nodoc:
     MAJOR = 0
     MINOR = 3
-    TINY  = 1
+    TINY  = 2
     STRING = [MAJOR, MINOR, TINY].join('.')
   end

data/test/test_rule_tagger.rb CHANGED

@@ -117,14 +117,14 @@ TMZ.com: Britney celebrated getting overnights with her kids by going on a wild
   end
   def test_multiple_docs
-    timer = Time.now
+    #timer = Time.now
     count = 0
     Dir["#{File.dirname(__FILE__)}/docs/doc*"].each do|doc|
       tagger.tag( File.read( doc ) )
       count += 1
     end
-    duration = Time.now - timer
-    puts "time: #{duration} sec #{count.to_f/duration} docs/sec"
+    #duration = Time.now - timer
+    #puts "time: #{duration} sec #{count.to_f/duration} docs/sec"
   end
   def test_suggest
@@ -136,7 +136,7 @@ TMZ.com: Britney celebrated getting overnights with her kids by going on a wild
     assert results.include?(["Jamie Spears", "NNP", 12])
 #    puts results.inspect
     results = tagger.suggest( SAMPLE_DOC3, 5 )
-    puts results.inspect
+    #puts results.inspect
   end
   def test_adjectives

data/test/test_word_tagger.rb CHANGED

@@ -9,11 +9,11 @@ class TestWordTagger < Test::Unit::TestCase
   end
   def test_basic
-    timer = Time.now
+    #timer = Time.now
     text = "This is a sa'mple doc[]ument lets see how cancer ngrams 4 works out for this interesting text!"
     tags = $wtagger.execute( text )
     assert_equal ['cancer','work'], tags
-    puts "Duration: #{Time.now - timer} sec"
+    #puts "Duration: #{Time.now - timer} sec"
   end
   def test_sample_bug
@@ -25,11 +25,11 @@ class TestWordTagger < Test::Unit::TestCase
   end
   def test_ngram_size3
-    timer = Time.now
+    #timer = Time.now
     text = "This body of text contains something like ventricular septal defect"
     tags = $wtagger.execute( text )
     assert_equal ['ventricular septal defect'], tags
-    puts "Duration: #{Time.now - timer} sec"
+    #puts "Duration: #{Time.now - timer} sec"
   end
   def test_cat_and_the_hat
@@ -38,4 +38,10 @@ class TestWordTagger < Test::Unit::TestCase
     assert_equal( ["Cat", "hat"], tags )
   end
+  def test_freq_counts
+    tagger = Word::Tagger.new( ['Cat','hat'], :words => 4 )
+    tags = tagger.freq( 'the cAt and the hat the cAt and the hat the cAt and the hat the cAt and the hat' )
+    assert_equal( {"Cat"=>4, "hat"=>4}, tags )
+  end
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rbtagger
 version: !ruby/object:Gem::Version
-  version: 0.3.1
+  version: 0.3.2
 platform: ruby
 authors:
 - Todd A. Fisher
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-05-21 00:00:00 -04:00
+date: 2009-07-07 00:00:00 -04:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -20,7 +20,7 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.8.0
+        version: 2.3.2
     version:
 description: A Simple Ruby Rule-Based Part of Speech Tagger
 email:
@@ -28,8 +28,8 @@ email:
 executables: []
 extensions:
-- ext/rule_tagger/extconf.rb
 - ext/word_tagger/extconf.rb
+- ext/rule_tagger/extconf.rb
 extra_rdoc_files:
 - History.txt
 - License.txt
@@ -37,17 +37,6 @@ extra_rdoc_files:
 - PostInstall.txt
 - README.txt
 - ext/word_tagger/test/doc.txt
-- test/docs/doc0.txt
-- test/docs/doc1.txt
-- test/docs/doc2.txt
-- test/docs/doc3.txt
-- test/docs/doc4.txt
-- test/docs/doc5.txt
-- test/docs/doc6.txt
-- test/docs/doc7.txt
-- test/docs/doc8.txt
-- test/docs/doc9.txt
-- test/fixtures/tags.txt
 - website/index.txt
 files:
 - COPYING
@@ -134,18 +123,16 @@ files:
 - website/template.html.erb
 has_rdoc: true
 homepage: http://rbtagger.rubyforge.org
-licenses: []
 post_install_message: |
   For more information on rbtagger, see http://rbtagger.rubyforge.org
 rdoc_options:
 - --main
-- README
+- README.txt
 require_paths:
 - lib
-- ext/rule_tagger
 - ext/word_tagger
+- ext/rule_tagger
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
@@ -161,11 +148,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: rbtagger
-rubygems_version: 1.3.2
+rubygems_version: 1.3.1
 signing_key:
-specification_version: 3
+specification_version: 2
 summary: A Simple Ruby Rule-Based Part of Speech Tagger
 test_files:
-- test/test_helper.rb
 - test/test_rule_tagger.rb
+- test/test_helper.rb
 - test/test_word_tagger.rb