rbtagger 0.3.1 → 0.3.2
This diff shows the differences between the contents of two publicly available package versions, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- data/ext/rule_tagger/extconf.rb +1 -1
- data/ext/rule_tagger/mkmf.log +4 -4
- data/ext/word_tagger/extconf.rb +1 -0
- data/ext/word_tagger/mkmf.log +2 -2
- data/ext/word_tagger/rtagger.cc +17 -2
- data/ext/word_tagger/tagger.cc +15 -11
- data/ext/word_tagger/tagger.h +8 -1
- data/lib/rbtagger/version.rb +1 -1
- data/test/test_rule_tagger.rb +4 -4
- data/test/test_word_tagger.rb +10 -4
- metadata +9 -22
data/ext/rule_tagger/extconf.rb
CHANGED
data/ext/rule_tagger/mkmf.log
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
have_header: checking for stdlib.h... -------------------- yes
|
2
2
|
|
3
|
-
"
|
3
|
+
"gcc -E -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC conftest.c -o conftest.i"
|
4
4
|
checked program was:
|
5
5
|
/* begin */
|
6
6
|
1: #include <stdlib.h>
|
@@ -10,7 +10,7 @@ checked program was:
|
|
10
10
|
|
11
11
|
have_header: checking for string.h... -------------------- yes
|
12
12
|
|
13
|
-
"
|
13
|
+
"gcc -E -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC conftest.c -o conftest.i"
|
14
14
|
checked program was:
|
15
15
|
/* begin */
|
16
16
|
1: #include <string.h>
|
@@ -20,7 +20,7 @@ checked program was:
|
|
20
20
|
|
21
21
|
have_library: checking for main() in -lc... -------------------- yes
|
22
22
|
|
23
|
-
"
|
23
|
+
"gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lruby-static -lc -lpthread -ldl -lcrypt -lm -lc"
|
24
24
|
checked program was:
|
25
25
|
/* begin */
|
26
26
|
1: /*top*/
|
@@ -32,7 +32,7 @@ checked program was:
|
|
32
32
|
|
33
33
|
have_func: checking for snprintf() in stdio.h... -------------------- yes
|
34
34
|
|
35
|
-
"
|
35
|
+
"gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lc -lruby-static -lc -lpthread -ldl -lcrypt -lm -lc"
|
36
36
|
checked program was:
|
37
37
|
/* begin */
|
38
38
|
1: #include <stdio.h>
|
data/ext/word_tagger/extconf.rb
CHANGED
data/ext/word_tagger/mkmf.log
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
have_library: checking for main() in -lc... -------------------- yes
|
2
2
|
|
3
|
-
"
|
3
|
+
"gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC -Wall conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lruby-static -lc -lpthread -ldl -lcrypt -lm -lc"
|
4
4
|
checked program was:
|
5
5
|
/* begin */
|
6
6
|
1: /*top*/
|
@@ -12,7 +12,7 @@ checked program was:
|
|
12
12
|
|
13
13
|
have_library: checking for main() in -lstdc++... -------------------- yes
|
14
14
|
|
15
|
-
"
|
15
|
+
"gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -fno-strict-aliasing -fPIC -Wall conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lc -lruby-static -lstdc++ -lc -lpthread -ldl -lcrypt -lm -lc"
|
16
16
|
checked program was:
|
17
17
|
/* begin */
|
18
18
|
1: /*top*/
|
data/ext/word_tagger/rtagger.cc
CHANGED
@@ -28,13 +28,27 @@ VALUE Tagger_execute( VALUE self, VALUE text )
|
|
28
28
|
{
|
29
29
|
NWordTagger *tagger;
|
30
30
|
Data_Get_Struct( self, NWordTagger, tagger );
|
31
|
-
std::vector<std::string> tags
|
31
|
+
std::vector<std::string> tags;
|
32
|
+
tagger->execute( tags, RSTRING_PTR(text) );
|
32
33
|
VALUE results = rb_ary_new2(tags.size());
|
33
|
-
for( size_t i = 0; i < tags.size(); ++i ){
|
34
|
+
for( size_t i = 0; i < tags.size(); ++i ) {
|
34
35
|
rb_ary_push( results, rb_str_new( tags[i].c_str(), tags[i].length() ) );
|
35
36
|
}
|
36
37
|
return results;
|
37
38
|
}
|
39
|
+
VALUE Tagger_execute_freq( VALUE self, VALUE text )
|
40
|
+
{
|
41
|
+
NWordTagger *tagger;
|
42
|
+
Data_Get_Struct( self, NWordTagger, tagger );
|
43
|
+
int max_count = 0;
|
44
|
+
std::map<std::string,int> tags;
|
45
|
+
tagger->execute_with_frequency( RSTRING_PTR(text), tags, max_count );
|
46
|
+
VALUE results = rb_hash_new();
|
47
|
+
for( std::map<std::string,int>::const_iterator it = tags.begin(); it != tags.end(); ++it ) {
|
48
|
+
rb_hash_aset( results, rb_str_new(it->first.c_str(), it->first.length()), rb_int_new(it->second) );
|
49
|
+
}
|
50
|
+
return results;
|
51
|
+
}
|
38
52
|
VALUE Tagger_set_words( VALUE self, VALUE words )
|
39
53
|
{
|
40
54
|
NWordTagger *tagger;
|
@@ -79,5 +93,6 @@ extern "C" void Init_word_tagger()
|
|
79
93
|
|
80
94
|
rb_define_method( rb_NWordTagger, "load_tags", (VALUE (*)(...))Tagger_load_tags, 1 );
|
81
95
|
rb_define_method( rb_NWordTagger, "execute", (VALUE (*)(...))Tagger_execute, 1 );
|
96
|
+
rb_define_method( rb_NWordTagger, "freq", (VALUE (*)(...))Tagger_execute_freq, 1 );
|
82
97
|
rb_define_method( rb_NWordTagger, "set_words", (VALUE (*)(...))Tagger_set_words, 1 );
|
83
98
|
}
|
data/ext/word_tagger/tagger.cc
CHANGED
@@ -37,7 +37,7 @@ static std::vector<std::string> word_split(const std::string& s)
|
|
37
37
|
|
38
38
|
static void word_downcase( std::string &word )
|
39
39
|
{
|
40
|
-
for(
|
40
|
+
for( std::string::size_type j = 0; j < word.size(); ++j ) {
|
41
41
|
word[j] = tolower( word[j] );
|
42
42
|
}
|
43
43
|
}
|
@@ -82,11 +82,9 @@ std::string NWordTagger::stemWord( const std::string &word )const
|
|
82
82
|
return stemmed;
|
83
83
|
}
|
84
84
|
|
85
|
-
|
85
|
+
int NWordTagger::execute_with_frequency( const char *text, std::map<std::string,int> &matched_tags, int &max_count )const
|
86
86
|
{
|
87
|
-
int max_count = 0;
|
88
87
|
std::vector<std::string> words = word_split( text );
|
89
|
-
std::map<std::string, int> matched_tags; // stores tags and frequency
|
90
88
|
std::string match_word;
|
91
89
|
std::map<std::string,std::string>::const_iterator matched;
|
92
90
|
|
@@ -130,16 +128,23 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
|
|
130
128
|
}
|
131
129
|
}
|
132
130
|
}
|
133
|
-
|
134
|
-
|
131
|
+
return matched_tags.size();
|
132
|
+
}
|
133
|
+
|
134
|
+
int NWordTagger::execute( std::vector<std::string> &reduced_tags, const char *text, unsigned short max )const
|
135
|
+
{
|
136
|
+
int max_count = 0;
|
137
|
+
std::map<std::string, int> matched_tags; // stores tags and frequency
|
135
138
|
|
139
|
+
execute_with_frequency(text, matched_tags, max_count);
|
140
|
+
|
136
141
|
// now we have a list of tags that match within the document text, check if we need to reduce the tags
|
137
142
|
if( matched_tags.size() < max ) {
|
138
143
|
// prepare the return vector
|
139
144
|
for( std::map<std::string, int>::iterator mloc = matched_tags.begin(); mloc != matched_tags.end(); ++mloc ){
|
140
145
|
reduced_tags.push_back( mloc->first );
|
141
146
|
}
|
142
|
-
return reduced_tags;
|
147
|
+
return reduced_tags.size();
|
143
148
|
}
|
144
149
|
|
145
150
|
// now that we have all the matched tags reduce to max using the tag frequency as a reduction measure
|
@@ -154,13 +159,12 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
|
|
154
159
|
// sort the tags in frequency order
|
155
160
|
std::sort( sorted_tags.begin(), sorted_tags.end(), WordComparitor() );
|
156
161
|
|
157
|
-
|
158
162
|
std::vector< std::pair<std::string, int> >::iterator mloc;
|
159
163
|
do {
|
160
164
|
for(mloc = sorted_tags.begin(); mloc != sorted_tags.end(); ++mloc ) {
|
161
165
|
std::pair< std::string, int > word_freq = *mloc;
|
162
|
-
printf( "word: %s, frequency: %d\n", word_freq.first.c_str(), word_freq.second );
|
163
|
-
printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
|
166
|
+
// printf( "word: %s, frequency: %d\n", word_freq.first.c_str(), word_freq.second );
|
167
|
+
// printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
|
164
168
|
if( word_freq.second < max_count ) {
|
165
169
|
sorted_tags.erase( mloc );
|
166
170
|
break;
|
@@ -172,5 +176,5 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
|
|
172
176
|
reduced_tags.push_back( sorted_tags[i].first );
|
173
177
|
}
|
174
178
|
|
175
|
-
return
|
179
|
+
return matched_tags.size();
|
176
180
|
}
|
data/ext/word_tagger/tagger.h
CHANGED
@@ -14,7 +14,14 @@ struct NWordTagger {
|
|
14
14
|
short getNWords()const{ return nwords; }
|
15
15
|
void setNWords( short words ){ nwords = words; }
|
16
16
|
|
17
|
-
|
17
|
+
// return the number of matched tags
|
18
|
+
// fill results with matching tags in the text body
|
19
|
+
// keep the number of tags returned within the threshold of max. reducing tags by least frequent
|
20
|
+
int execute( std::vector<std::string> &reduced_tags, const char *text, unsigned short max = 10 )const;
|
21
|
+
|
22
|
+
// return the number of matched tags
|
23
|
+
// result is updated with a mapping of matched tags with their individual term frequency count
|
24
|
+
int execute_with_frequency( const char *text, std::map<std::string,int> &matched_tags, int &max_count )const;
|
18
25
|
private:
|
19
26
|
short nwords;
|
20
27
|
struct stemmer *stemmer;
|
data/lib/rbtagger/version.rb
CHANGED
data/test/test_rule_tagger.rb
CHANGED
@@ -117,14 +117,14 @@ TMZ.com: Britney celebrated getting overnights with her kids by going on a wild
|
|
117
117
|
end
|
118
118
|
|
119
119
|
def test_multiple_docs
|
120
|
-
timer = Time.now
|
120
|
+
#timer = Time.now
|
121
121
|
count = 0
|
122
122
|
Dir["#{File.dirname(__FILE__)}/docs/doc*"].each do|doc|
|
123
123
|
tagger.tag( File.read( doc ) )
|
124
124
|
count += 1
|
125
125
|
end
|
126
|
-
duration = Time.now - timer
|
127
|
-
puts "time: #{duration} sec #{count.to_f/duration} docs/sec"
|
126
|
+
#duration = Time.now - timer
|
127
|
+
#puts "time: #{duration} sec #{count.to_f/duration} docs/sec"
|
128
128
|
end
|
129
129
|
|
130
130
|
def test_suggest
|
@@ -136,7 +136,7 @@ TMZ.com: Britney celebrated getting overnights with her kids by going on a wild
|
|
136
136
|
assert results.include?(["Jamie Spears", "NNP", 12])
|
137
137
|
# puts results.inspect
|
138
138
|
results = tagger.suggest( SAMPLE_DOC3, 5 )
|
139
|
-
puts results.inspect
|
139
|
+
#puts results.inspect
|
140
140
|
end
|
141
141
|
|
142
142
|
def test_adjectives
|
data/test/test_word_tagger.rb
CHANGED
@@ -9,11 +9,11 @@ class TestWordTagger < Test::Unit::TestCase
|
|
9
9
|
end
|
10
10
|
|
11
11
|
def test_basic
|
12
|
-
timer = Time.now
|
12
|
+
#timer = Time.now
|
13
13
|
text = "This is a sa'mple doc[]ument lets see how cancer ngrams 4 works out for this interesting text!"
|
14
14
|
tags = $wtagger.execute( text )
|
15
15
|
assert_equal ['cancer','work'], tags
|
16
|
-
puts "Duration: #{Time.now - timer} sec"
|
16
|
+
#puts "Duration: #{Time.now - timer} sec"
|
17
17
|
end
|
18
18
|
|
19
19
|
def test_sample_bug
|
@@ -25,11 +25,11 @@ class TestWordTagger < Test::Unit::TestCase
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def test_ngram_size3
|
28
|
-
timer = Time.now
|
28
|
+
#timer = Time.now
|
29
29
|
text = "This body of text contains something like ventricular septal defect"
|
30
30
|
tags = $wtagger.execute( text )
|
31
31
|
assert_equal ['ventricular septal defect'], tags
|
32
|
-
puts "Duration: #{Time.now - timer} sec"
|
32
|
+
#puts "Duration: #{Time.now - timer} sec"
|
33
33
|
end
|
34
34
|
|
35
35
|
def test_cat_and_the_hat
|
@@ -38,4 +38,10 @@ class TestWordTagger < Test::Unit::TestCase
|
|
38
38
|
assert_equal( ["Cat", "hat"], tags )
|
39
39
|
end
|
40
40
|
|
41
|
+
def test_freq_counts
|
42
|
+
tagger = Word::Tagger.new( ['Cat','hat'], :words => 4 )
|
43
|
+
tags = tagger.freq( 'the cAt and the hat the cAt and the hat the cAt and the hat the cAt and the hat' )
|
44
|
+
assert_equal( {"Cat"=>4, "hat"=>4}, tags )
|
45
|
+
end
|
46
|
+
|
41
47
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbtagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Todd A. Fisher
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-07-07 00:00:00 -04:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -20,7 +20,7 @@ dependencies:
|
|
20
20
|
requirements:
|
21
21
|
- - ">="
|
22
22
|
- !ruby/object:Gem::Version
|
23
|
-
version:
|
23
|
+
version: 2.3.2
|
24
24
|
version:
|
25
25
|
description: A Simple Ruby Rule-Based Part of Speech Tagger
|
26
26
|
email:
|
@@ -28,8 +28,8 @@ email:
|
|
28
28
|
executables: []
|
29
29
|
|
30
30
|
extensions:
|
31
|
-
- ext/rule_tagger/extconf.rb
|
32
31
|
- ext/word_tagger/extconf.rb
|
32
|
+
- ext/rule_tagger/extconf.rb
|
33
33
|
extra_rdoc_files:
|
34
34
|
- History.txt
|
35
35
|
- License.txt
|
@@ -37,17 +37,6 @@ extra_rdoc_files:
|
|
37
37
|
- PostInstall.txt
|
38
38
|
- README.txt
|
39
39
|
- ext/word_tagger/test/doc.txt
|
40
|
-
- test/docs/doc0.txt
|
41
|
-
- test/docs/doc1.txt
|
42
|
-
- test/docs/doc2.txt
|
43
|
-
- test/docs/doc3.txt
|
44
|
-
- test/docs/doc4.txt
|
45
|
-
- test/docs/doc5.txt
|
46
|
-
- test/docs/doc6.txt
|
47
|
-
- test/docs/doc7.txt
|
48
|
-
- test/docs/doc8.txt
|
49
|
-
- test/docs/doc9.txt
|
50
|
-
- test/fixtures/tags.txt
|
51
40
|
- website/index.txt
|
52
41
|
files:
|
53
42
|
- COPYING
|
@@ -134,18 +123,16 @@ files:
|
|
134
123
|
- website/template.html.erb
|
135
124
|
has_rdoc: true
|
136
125
|
homepage: http://rbtagger.rubyforge.org
|
137
|
-
licenses: []
|
138
|
-
|
139
126
|
post_install_message: |
|
140
127
|
For more information on rbtagger, see http://rbtagger.rubyforge.org
|
141
128
|
|
142
129
|
rdoc_options:
|
143
130
|
- --main
|
144
|
-
- README
|
131
|
+
- README.txt
|
145
132
|
require_paths:
|
146
133
|
- lib
|
147
|
-
- ext/rule_tagger
|
148
134
|
- ext/word_tagger
|
135
|
+
- ext/rule_tagger
|
149
136
|
required_ruby_version: !ruby/object:Gem::Requirement
|
150
137
|
requirements:
|
151
138
|
- - ">="
|
@@ -161,11 +148,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
161
148
|
requirements: []
|
162
149
|
|
163
150
|
rubyforge_project: rbtagger
|
164
|
-
rubygems_version: 1.3.
|
151
|
+
rubygems_version: 1.3.1
|
165
152
|
signing_key:
|
166
|
-
specification_version:
|
153
|
+
specification_version: 2
|
167
154
|
summary: A Simple Ruby Rule-Based Part of Speech Tagger
|
168
155
|
test_files:
|
169
|
-
- test/test_helper.rb
|
170
156
|
- test/test_rule_tagger.rb
|
157
|
+
- test/test_helper.rb
|
171
158
|
- test/test_word_tagger.rb
|