rbtagger 0.3.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/rule_tagger/extconf.rb +1 -1
- data/ext/rule_tagger/mkmf.log +4 -4
- data/ext/word_tagger/extconf.rb +1 -0
- data/ext/word_tagger/mkmf.log +2 -2
- data/ext/word_tagger/rtagger.cc +17 -2
- data/ext/word_tagger/tagger.cc +15 -11
- data/ext/word_tagger/tagger.h +8 -1
- data/lib/rbtagger/version.rb +1 -1
- data/test/test_rule_tagger.rb +4 -4
- data/test/test_word_tagger.rb +10 -4
- metadata +9 -22
data/ext/rule_tagger/extconf.rb
CHANGED
data/ext/rule_tagger/mkmf.log
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
have_header: checking for stdlib.h... -------------------- yes
|
2
2
|
|
3
|
-
"
|
3
|
+
"gcc -E -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC conftest.c -o conftest.i"
|
4
4
|
checked program was:
|
5
5
|
/* begin */
|
6
6
|
1: #include <stdlib.h>
|
@@ -10,7 +10,7 @@ checked program was:
|
|
10
10
|
|
11
11
|
have_header: checking for string.h... -------------------- yes
|
12
12
|
|
13
|
-
"
|
13
|
+
"gcc -E -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC conftest.c -o conftest.i"
|
14
14
|
checked program was:
|
15
15
|
/* begin */
|
16
16
|
1: #include <string.h>
|
@@ -20,7 +20,7 @@ checked program was:
|
|
20
20
|
|
21
21
|
have_library: checking for main() in -lc... -------------------- yes
|
22
22
|
|
23
|
-
"
|
23
|
+
"gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lruby-static -lc -lpthread -ldl -lcrypt -lm -lc"
|
24
24
|
checked program was:
|
25
25
|
/* begin */
|
26
26
|
1: /*top*/
|
@@ -32,7 +32,7 @@ checked program was:
|
|
32
32
|
|
33
33
|
have_func: checking for snprintf() in stdio.h... -------------------- yes
|
34
34
|
|
35
|
-
"
|
35
|
+
"gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lc -lruby-static -lc -lpthread -ldl -lcrypt -lm -lc"
|
36
36
|
checked program was:
|
37
37
|
/* begin */
|
38
38
|
1: #include <stdio.h>
|
data/ext/word_tagger/extconf.rb
CHANGED
data/ext/word_tagger/mkmf.log
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
have_library: checking for main() in -lc... -------------------- yes
|
2
2
|
|
3
|
-
"
|
3
|
+
"gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC -Wall conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lruby-static -lc -lpthread -ldl -lcrypt -lm -lc"
|
4
4
|
checked program was:
|
5
5
|
/* begin */
|
6
6
|
1: /*top*/
|
@@ -12,7 +12,7 @@ checked program was:
|
|
12
12
|
|
13
13
|
have_library: checking for main() in -lstdc++... -------------------- yes
|
14
14
|
|
15
|
-
"
|
15
|
+
"gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC -Wall conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lc -lruby-static -lstdc++ -lc -lpthread -ldl -lcrypt -lm -lc"
|
16
16
|
checked program was:
|
17
17
|
/* begin */
|
18
18
|
1: /*top*/
|
data/ext/word_tagger/rtagger.cc
CHANGED
@@ -28,13 +28,27 @@ VALUE Tagger_execute( VALUE self, VALUE text )
|
|
28
28
|
{
|
29
29
|
NWordTagger *tagger;
|
30
30
|
Data_Get_Struct( self, NWordTagger, tagger );
|
31
|
-
std::vector<std::string> tags
|
31
|
+
std::vector<std::string> tags;
|
32
|
+
tagger->execute( tags, RSTRING_PTR(text) );
|
32
33
|
VALUE results = rb_ary_new2(tags.size());
|
33
|
-
for( size_t i = 0; i < tags.size(); ++i ){
|
34
|
+
for( size_t i = 0; i < tags.size(); ++i ) {
|
34
35
|
rb_ary_push( results, rb_str_new( tags[i].c_str(), tags[i].length() ) );
|
35
36
|
}
|
36
37
|
return results;
|
37
38
|
}
|
39
|
+
VALUE Tagger_execute_freq( VALUE self, VALUE text )
|
40
|
+
{
|
41
|
+
NWordTagger *tagger;
|
42
|
+
Data_Get_Struct( self, NWordTagger, tagger );
|
43
|
+
int max_count = 0;
|
44
|
+
std::map<std::string,int> tags;
|
45
|
+
tagger->execute_with_frequency( RSTRING_PTR(text), tags, max_count );
|
46
|
+
VALUE results = rb_hash_new();
|
47
|
+
for( std::map<std::string,int>::const_iterator it = tags.begin(); it != tags.end(); ++it ) {
|
48
|
+
rb_hash_aset( results, rb_str_new(it->first.c_str(), it->first.length()), rb_int_new(it->second) );
|
49
|
+
}
|
50
|
+
return results;
|
51
|
+
}
|
38
52
|
VALUE Tagger_set_words( VALUE self, VALUE words )
|
39
53
|
{
|
40
54
|
NWordTagger *tagger;
|
@@ -79,5 +93,6 @@ extern "C" void Init_word_tagger()
|
|
79
93
|
|
80
94
|
rb_define_method( rb_NWordTagger, "load_tags", (VALUE (*)(...))Tagger_load_tags, 1 );
|
81
95
|
rb_define_method( rb_NWordTagger, "execute", (VALUE (*)(...))Tagger_execute, 1 );
|
96
|
+
rb_define_method( rb_NWordTagger, "freq", (VALUE (*)(...))Tagger_execute_freq, 1 );
|
82
97
|
rb_define_method( rb_NWordTagger, "set_words", (VALUE (*)(...))Tagger_set_words, 1 );
|
83
98
|
}
|
data/ext/word_tagger/tagger.cc
CHANGED
@@ -37,7 +37,7 @@ static std::vector<std::string> word_split(const std::string& s)
|
|
37
37
|
|
38
38
|
static void word_downcase( std::string &word )
|
39
39
|
{
|
40
|
-
for(
|
40
|
+
for( std::string::size_type j = 0; j < word.size(); ++j ) {
|
41
41
|
word[j] = tolower( word[j] );
|
42
42
|
}
|
43
43
|
}
|
@@ -82,11 +82,9 @@ std::string NWordTagger::stemWord( const std::string &word )const
|
|
82
82
|
return stemmed;
|
83
83
|
}
|
84
84
|
|
85
|
-
|
85
|
+
int NWordTagger::execute_with_frequency( const char *text, std::map<std::string,int> &matched_tags, int &max_count )const
|
86
86
|
{
|
87
|
-
int max_count = 0;
|
88
87
|
std::vector<std::string> words = word_split( text );
|
89
|
-
std::map<std::string, int> matched_tags; // stores tags and frequency
|
90
88
|
std::string match_word;
|
91
89
|
std::map<std::string,std::string>::const_iterator matched;
|
92
90
|
|
@@ -130,16 +128,23 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
|
|
130
128
|
}
|
131
129
|
}
|
132
130
|
}
|
133
|
-
|
134
|
-
|
131
|
+
return matched_tags.size();
|
132
|
+
}
|
133
|
+
|
134
|
+
int NWordTagger::execute( std::vector<std::string> &reduced_tags, const char *text, unsigned short max )const
|
135
|
+
{
|
136
|
+
int max_count = 0;
|
137
|
+
std::map<std::string, int> matched_tags; // stores tags and frequency
|
135
138
|
|
139
|
+
execute_with_frequency(text, matched_tags, max_count);
|
140
|
+
|
136
141
|
// now we have a list of tags that match within the document text, check if we need to reduce the tags
|
137
142
|
if( matched_tags.size() < max ) {
|
138
143
|
// prepare the return vector
|
139
144
|
for( std::map<std::string, int>::iterator mloc = matched_tags.begin(); mloc != matched_tags.end(); ++mloc ){
|
140
145
|
reduced_tags.push_back( mloc->first );
|
141
146
|
}
|
142
|
-
return reduced_tags;
|
147
|
+
return reduced_tags.size();
|
143
148
|
}
|
144
149
|
|
145
150
|
// now that we have all the matched tags reduce to max using the tag frequency as a reduction measure
|
@@ -154,13 +159,12 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
|
|
154
159
|
// sort the tags in frequency order
|
155
160
|
std::sort( sorted_tags.begin(), sorted_tags.end(), WordComparitor() );
|
156
161
|
|
157
|
-
|
158
162
|
std::vector< std::pair<std::string, int> >::iterator mloc;
|
159
163
|
do {
|
160
164
|
for(mloc = sorted_tags.begin(); mloc != sorted_tags.end(); ++mloc ) {
|
161
165
|
std::pair< std::string, int > word_freq = *mloc;
|
162
|
-
printf( "word: %s, frequency: %d\n", word_freq.first.c_str(), word_freq.second );
|
163
|
-
printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
|
166
|
+
// printf( "word: %s, frequency: %d\n", word_freq.first.c_str(), word_freq.second );
|
167
|
+
// printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
|
164
168
|
if( word_freq.second < max_count ) {
|
165
169
|
sorted_tags.erase( mloc );
|
166
170
|
break;
|
@@ -172,5 +176,5 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
|
|
172
176
|
reduced_tags.push_back( sorted_tags[i].first );
|
173
177
|
}
|
174
178
|
|
175
|
-
return reduced_tags;
|
179
|
+
return matched_tags.size();
|
176
180
|
}
|
data/ext/word_tagger/tagger.h
CHANGED
@@ -14,7 +14,14 @@ struct NWordTagger {
|
|
14
14
|
short getNWords()const{ return nwords; }
|
15
15
|
void setNWords( short words ){ nwords = words; }
|
16
16
|
|
17
|
-
|
17
|
+
// return the number of matched tags
|
18
|
+
// fill results with matching tags in the text body
|
19
|
+
// keep the number of tags returned within the threshold of max. reducing tags by least frequent
|
20
|
+
int execute( std::vector<std::string> &reduced_tags, const char *text, unsigned short max = 10 )const;
|
21
|
+
|
22
|
+
// return the number of matched tags
|
23
|
+
// result is updated with a mapping of matched tags with their individual term frequency count
|
24
|
+
int execute_with_frequency( const char *text, std::map<std::string,int> &matched_tags, int &max_count )const;
|
18
25
|
private:
|
19
26
|
short nwords;
|
20
27
|
struct stemmer *stemmer;
|
data/lib/rbtagger/version.rb
CHANGED
data/test/test_rule_tagger.rb
CHANGED
@@ -117,14 +117,14 @@ TMZ.com: Britney celebrated getting overnights with her kids by going on a wild
|
|
117
117
|
end
|
118
118
|
|
119
119
|
def test_multiple_docs
|
120
|
-
timer = Time.now
|
120
|
+
#timer = Time.now
|
121
121
|
count = 0
|
122
122
|
Dir["#{File.dirname(__FILE__)}/docs/doc*"].each do|doc|
|
123
123
|
tagger.tag( File.read( doc ) )
|
124
124
|
count += 1
|
125
125
|
end
|
126
|
-
duration = Time.now - timer
|
127
|
-
puts "time: #{duration} sec #{count.to_f/duration} docs/sec"
|
126
|
+
#duration = Time.now - timer
|
127
|
+
#puts "time: #{duration} sec #{count.to_f/duration} docs/sec"
|
128
128
|
end
|
129
129
|
|
130
130
|
def test_suggest
|
@@ -136,7 +136,7 @@ TMZ.com: Britney celebrated getting overnights with her kids by going on a wild
|
|
136
136
|
assert results.include?(["Jamie Spears", "NNP", 12])
|
137
137
|
# puts results.inspect
|
138
138
|
results = tagger.suggest( SAMPLE_DOC3, 5 )
|
139
|
-
puts results.inspect
|
139
|
+
#puts results.inspect
|
140
140
|
end
|
141
141
|
|
142
142
|
def test_adjectives
|
data/test/test_word_tagger.rb
CHANGED
@@ -9,11 +9,11 @@ class TestWordTagger < Test::Unit::TestCase
|
|
9
9
|
end
|
10
10
|
|
11
11
|
def test_basic
|
12
|
-
timer = Time.now
|
12
|
+
#timer = Time.now
|
13
13
|
text = "This is a sa'mple doc[]ument lets see how cancer ngrams 4 works out for this interesting text!"
|
14
14
|
tags = $wtagger.execute( text )
|
15
15
|
assert_equal ['cancer','work'], tags
|
16
|
-
puts "Duration: #{Time.now - timer} sec"
|
16
|
+
#puts "Duration: #{Time.now - timer} sec"
|
17
17
|
end
|
18
18
|
|
19
19
|
def test_sample_bug
|
@@ -25,11 +25,11 @@ class TestWordTagger < Test::Unit::TestCase
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def test_ngram_size3
|
28
|
-
timer = Time.now
|
28
|
+
#timer = Time.now
|
29
29
|
text = "This body of text contains something like ventricular septal defect"
|
30
30
|
tags = $wtagger.execute( text )
|
31
31
|
assert_equal ['ventricular septal defect'], tags
|
32
|
-
puts "Duration: #{Time.now - timer} sec"
|
32
|
+
#puts "Duration: #{Time.now - timer} sec"
|
33
33
|
end
|
34
34
|
|
35
35
|
def test_cat_and_the_hat
|
@@ -38,4 +38,10 @@ class TestWordTagger < Test::Unit::TestCase
|
|
38
38
|
assert_equal( ["Cat", "hat"], tags )
|
39
39
|
end
|
40
40
|
|
41
|
+
def test_freq_counts
|
42
|
+
tagger = Word::Tagger.new( ['Cat','hat'], :words => 4 )
|
43
|
+
tags = tagger.freq( 'the cAt and the hat the cAt and the hat the cAt and the hat the cAt and the hat' )
|
44
|
+
assert_equal( {"Cat"=>4, "hat"=>4}, tags )
|
45
|
+
end
|
46
|
+
|
41
47
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbtagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Todd A. Fisher
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-07-07 00:00:00 -04:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -20,7 +20,7 @@ dependencies:
|
|
20
20
|
requirements:
|
21
21
|
- - ">="
|
22
22
|
- !ruby/object:Gem::Version
|
23
|
-
version:
|
23
|
+
version: 2.3.2
|
24
24
|
version:
|
25
25
|
description: A Simple Ruby Rule-Based Part of Speech Tagger
|
26
26
|
email:
|
@@ -28,8 +28,8 @@ email:
|
|
28
28
|
executables: []
|
29
29
|
|
30
30
|
extensions:
|
31
|
-
- ext/rule_tagger/extconf.rb
|
32
31
|
- ext/word_tagger/extconf.rb
|
32
|
+
- ext/rule_tagger/extconf.rb
|
33
33
|
extra_rdoc_files:
|
34
34
|
- History.txt
|
35
35
|
- License.txt
|
@@ -37,17 +37,6 @@ extra_rdoc_files:
|
|
37
37
|
- PostInstall.txt
|
38
38
|
- README.txt
|
39
39
|
- ext/word_tagger/test/doc.txt
|
40
|
-
- test/docs/doc0.txt
|
41
|
-
- test/docs/doc1.txt
|
42
|
-
- test/docs/doc2.txt
|
43
|
-
- test/docs/doc3.txt
|
44
|
-
- test/docs/doc4.txt
|
45
|
-
- test/docs/doc5.txt
|
46
|
-
- test/docs/doc6.txt
|
47
|
-
- test/docs/doc7.txt
|
48
|
-
- test/docs/doc8.txt
|
49
|
-
- test/docs/doc9.txt
|
50
|
-
- test/fixtures/tags.txt
|
51
40
|
- website/index.txt
|
52
41
|
files:
|
53
42
|
- COPYING
|
@@ -134,18 +123,16 @@ files:
|
|
134
123
|
- website/template.html.erb
|
135
124
|
has_rdoc: true
|
136
125
|
homepage: http://rbtagger.rubyforge.org
|
137
|
-
licenses: []
|
138
|
-
|
139
126
|
post_install_message: |
|
140
127
|
For more information on rbtagger, see http://rbtagger.rubyforge.org
|
141
128
|
|
142
129
|
rdoc_options:
|
143
130
|
- --main
|
144
|
-
- README
|
131
|
+
- README.txt
|
145
132
|
require_paths:
|
146
133
|
- lib
|
147
|
-
- ext/rule_tagger
|
148
134
|
- ext/word_tagger
|
135
|
+
- ext/rule_tagger
|
149
136
|
required_ruby_version: !ruby/object:Gem::Requirement
|
150
137
|
requirements:
|
151
138
|
- - ">="
|
@@ -161,11 +148,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
161
148
|
requirements: []
|
162
149
|
|
163
150
|
rubyforge_project: rbtagger
|
164
|
-
rubygems_version: 1.3.
|
151
|
+
rubygems_version: 1.3.1
|
165
152
|
signing_key:
|
166
|
-
specification_version:
|
153
|
+
specification_version: 2
|
167
154
|
summary: A Simple Ruby Rule-Based Part of Speech Tagger
|
168
155
|
test_files:
|
169
|
-
- test/test_helper.rb
|
170
156
|
- test/test_rule_tagger.rb
|
157
|
+
- test/test_helper.rb
|
171
158
|
- test/test_word_tagger.rb
|