rbtagger 0.2.8 → 0.2.9
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/rule_tagger/mkmf.log +4 -4
- data/ext/word_tagger/mkmf.log +2 -2
- data/ext/word_tagger/tagger.cc +16 -5
- data/lib/rbtagger/version.rb +1 -1
- data/test/test_rule_tagger.rb +0 -5
- data/test/test_word_tagger.rb +8 -0
- metadata +5 -5
data/ext/rule_tagger/mkmf.log
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
have_header: checking for stdlib.h... -------------------- yes
|
2
2
|
|
3
|
-
"gcc -E -I. -I/
|
3
|
+
"gcc -E -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -Wall -fPIC conftest.c -o conftest.i"
|
4
4
|
checked program was:
|
5
5
|
/* begin */
|
6
6
|
1: #include <stdlib.h>
|
@@ -10,7 +10,7 @@ checked program was:
|
|
10
10
|
|
11
11
|
have_header: checking for string.h... -------------------- yes
|
12
12
|
|
13
|
-
"gcc -E -I. -I/
|
13
|
+
"gcc -E -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -Wall -fPIC conftest.c -o conftest.i"
|
14
14
|
checked program was:
|
15
15
|
/* begin */
|
16
16
|
1: #include <string.h>
|
@@ -20,7 +20,7 @@ checked program was:
|
|
20
20
|
|
21
21
|
have_library: checking for main() in -lc... -------------------- yes
|
22
22
|
|
23
|
-
"gcc -o conftest -I. -I/
|
23
|
+
"gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -Wall -fPIC conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lruby-static -lc -lpthread -ldl -lcrypt -lm -lc"
|
24
24
|
checked program was:
|
25
25
|
/* begin */
|
26
26
|
1: /*top*/
|
@@ -32,7 +32,7 @@ checked program was:
|
|
32
32
|
|
33
33
|
have_func: checking for snprintf() in stdio.h... -------------------- yes
|
34
34
|
|
35
|
-
"gcc -o conftest -I. -I/
|
35
|
+
"gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -Wall -fPIC conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lc -lruby-static -lc -lpthread -ldl -lcrypt -lm -lc"
|
36
36
|
checked program was:
|
37
37
|
/* begin */
|
38
38
|
1: #include <stdio.h>
|
data/ext/word_tagger/mkmf.log
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
have_library: checking for main() in -lc... -------------------- yes
|
2
2
|
|
3
|
-
"gcc -o conftest -I. -I/
|
3
|
+
"gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -Wall -fPIC conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lruby-static -lc -lpthread -ldl -lcrypt -lm -lc"
|
4
4
|
checked program was:
|
5
5
|
/* begin */
|
6
6
|
1: /*top*/
|
@@ -12,7 +12,7 @@ checked program was:
|
|
12
12
|
|
13
13
|
have_library: checking for main() in -lstdc++... -------------------- yes
|
14
14
|
|
15
|
-
"gcc -o conftest -I. -I/
|
15
|
+
"gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -Wall -fPIC conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lc -lruby-static -lstdc++ -lc -lpthread -ldl -lcrypt -lm -lc"
|
16
16
|
checked program was:
|
17
17
|
/* begin */
|
18
18
|
1: /*top*/
|
data/ext/word_tagger/tagger.cc
CHANGED
@@ -101,14 +101,15 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
|
|
101
101
|
for( short j = 1; (j <= this->nwords) && ((i+j) < words.size()); ++j ) {
|
102
102
|
matched = this->tags.find( match_word );
|
103
103
|
if( matched != this->tags.end() ){
|
104
|
-
//printf( "word: %d:(%s->%s)\n", i, match_word.c_str(), matched->second.c_str() );
|
105
104
|
std::map<std::string, int>::iterator mloc = matched_tags.find( matched->second );
|
106
105
|
if( mloc == matched_tags.end() ) {
|
107
106
|
matched_tags[matched->second] = 1; // count 1
|
107
|
+
//printf( "word: %d:(%s->%s) %d, hits: 1\n", i, match_word.c_str(), matched->second.c_str(), j );
|
108
108
|
}
|
109
109
|
else {
|
110
110
|
mloc->second++;
|
111
111
|
if( max_count < mloc->second ) { max_count = mloc->second; }
|
112
|
+
//printf( "word: %d:(%s->%s) %d, hits: %d\n", i, match_word.c_str(), matched->second.c_str(), j, mloc->second );
|
112
113
|
}
|
113
114
|
}
|
114
115
|
// stem each word and compare against our tag bank
|
@@ -129,11 +130,22 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
|
|
129
130
|
}
|
130
131
|
}
|
131
132
|
}
|
133
|
+
|
134
|
+
std::vector< std::string > reduced_tags;
|
135
|
+
|
136
|
+
// now we have a list of tags that match within the document text, check if we need to reduce the tags
|
137
|
+
if( matched_tags.size() < max ) {
|
138
|
+
// prepare the return vector
|
139
|
+
for( std::map<std::string, int>::iterator mloc = matched_tags.begin(); mloc != matched_tags.end(); ++mloc ){
|
140
|
+
reduced_tags.push_back( mloc->first );
|
141
|
+
}
|
142
|
+
return reduced_tags;
|
143
|
+
}
|
132
144
|
|
133
145
|
// now that we have all the matched tags reduce to max using the tag frequency as a reduction measure
|
134
146
|
std::vector< std::pair<std::string,int> > sorted_tags;
|
135
147
|
|
136
|
-
//printf( "max frequency: %d\n", max_count );
|
148
|
+
//printf( "max frequency: %d, total tagged: %d, reducing to %d\n", max_count, matched_tags.size(), max );
|
137
149
|
for( std::map<std::string, int>::iterator mloc = matched_tags.begin(); mloc != matched_tags.end(); ++mloc ){
|
138
150
|
//printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
|
139
151
|
sorted_tags.push_back(*mloc);
|
@@ -142,14 +154,13 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
|
|
142
154
|
// sort the tags in frequency order
|
143
155
|
std::sort( sorted_tags.begin(), sorted_tags.end(), WordComparitor() );
|
144
156
|
|
145
|
-
std::vector< std::string > reduced_tags;
|
146
157
|
|
147
158
|
std::vector< std::pair<std::string, int> >::iterator mloc;
|
148
159
|
do {
|
149
160
|
for(mloc = sorted_tags.begin(); mloc != sorted_tags.end(); ++mloc ) {
|
150
161
|
std::pair< std::string, int > word_freq = *mloc;
|
151
|
-
|
152
|
-
|
162
|
+
printf( "word: %s, frequency: %d\n", word_freq.first.c_str(), word_freq.second );
|
163
|
+
printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
|
153
164
|
if( word_freq.second < max_count ) {
|
154
165
|
sorted_tags.erase( mloc );
|
155
166
|
break;
|
data/lib/rbtagger/version.rb
CHANGED
data/test/test_rule_tagger.rb
CHANGED
@@ -32,9 +32,6 @@ The details of her visitation, however, are unclear.
|
|
32
32
|
Asked by Us if she were happy with the court outcome, Spears (clutching an Ed Hardy purse) smiled and told Us, "Yes."
|
33
33
|
Next up: A status hearing set for July 15.
|
34
34
|
The couple last appeared in court May 6. Spears was granted extended visitation — three days a week from 9 a.m. to 5 p.m. — of Sean Preston, 2, and Jayden James, 20 months.
|
35
|
-
)
|
36
|
-
SAMPLE_DOC3=%q(
|
37
|
-
TMZ.com: Britney celebrated getting overnights with her kids by going on a wild shopping trip for herself.With L.A.'s finest at her service, it was a total clusterf**k outside of Fred Segal as Brit Brit made her way out. The scene was crazy -- and it was all... Read more
|
38
35
|
)
|
39
36
|
def setup
|
40
37
|
if !defined?($tagger)
|
@@ -137,8 +134,6 @@ TMZ.com: Britney celebrated getting overnights with her kids by going on a wild
|
|
137
134
|
assert results.include?(["Britney Spears", "NNP", 6])
|
138
135
|
assert results.include?(["Jamie Spears", "NNP", 12])
|
139
136
|
# puts results.inspect
|
140
|
-
results = tagger.suggest( SAMPLE_DOC3, 5 )
|
141
|
-
puts results.inspect
|
142
137
|
end
|
143
138
|
|
144
139
|
private
|
data/test/test_word_tagger.rb
CHANGED
@@ -16,6 +16,14 @@ class TestWordTagger < Test::Unit::TestCase
|
|
16
16
|
puts "Duration: #{Time.now - timer} sec"
|
17
17
|
end
|
18
18
|
|
19
|
+
def test_sample_bug
|
20
|
+
tags = ["foo", "bar", "baz", "squishy", "yummy"]
|
21
|
+
txt = 'This is some sample text. Foo walked into a bar. The bartender said "What can I get you?" Foo said he wanted something yummy - like a baz.'
|
22
|
+
tagger = Word::Tagger.new tags, :words => 4
|
23
|
+
result_tags = tagger.execute( txt )
|
24
|
+
assert_equal ["bar", "baz", "foo", "yummy"], result_tags
|
25
|
+
end
|
26
|
+
|
19
27
|
def test_ngram_size3
|
20
28
|
timer = Time.now
|
21
29
|
text = "This body of text contains something like ventricular septal defect"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbtagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.8
|
4
|
+
version: 0.2.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Todd A. Fisher
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-
|
12
|
+
date: 2008-08-28 00:00:00 -04:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -28,8 +28,8 @@ email:
|
|
28
28
|
executables: []
|
29
29
|
|
30
30
|
extensions:
|
31
|
-
- ext/rule_tagger/extconf.rb
|
32
31
|
- ext/word_tagger/extconf.rb
|
32
|
+
- ext/rule_tagger/extconf.rb
|
33
33
|
extra_rdoc_files:
|
34
34
|
- History.txt
|
35
35
|
- License.txt
|
@@ -142,8 +142,8 @@ rdoc_options:
|
|
142
142
|
- README.txt
|
143
143
|
require_paths:
|
144
144
|
- lib
|
145
|
-
- ext/rule_tagger
|
146
145
|
- ext/word_tagger
|
146
|
+
- ext/rule_tagger
|
147
147
|
required_ruby_version: !ruby/object:Gem::Requirement
|
148
148
|
requirements:
|
149
149
|
- - ">="
|
@@ -164,6 +164,6 @@ signing_key:
|
|
164
164
|
specification_version: 2
|
165
165
|
summary: A Simple Ruby Rule-Based Part of Speech Tagger
|
166
166
|
test_files:
|
167
|
-
- test/test_helper.rb
|
168
167
|
- test/test_rule_tagger.rb
|
168
|
+
- test/test_helper.rb
|
169
169
|
- test/test_word_tagger.rb
|