rbtagger 0.2.8 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/rule_tagger/mkmf.log +4 -4
- data/ext/word_tagger/mkmf.log +2 -2
- data/ext/word_tagger/tagger.cc +16 -5
- data/lib/rbtagger/version.rb +1 -1
- data/test/test_rule_tagger.rb +0 -5
- data/test/test_word_tagger.rb +8 -0
- metadata +5 -5
data/ext/rule_tagger/mkmf.log
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
have_header: checking for stdlib.h... -------------------- yes
|
2
2
|
|
3
|
-
"gcc -E -I. -I/
|
3
|
+
"gcc -E -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -Wall -fPIC conftest.c -o conftest.i"
|
4
4
|
checked program was:
|
5
5
|
/* begin */
|
6
6
|
1: #include <stdlib.h>
|
@@ -10,7 +10,7 @@ checked program was:
|
|
10
10
|
|
11
11
|
have_header: checking for string.h... -------------------- yes
|
12
12
|
|
13
|
-
"gcc -E -I. -I/
|
13
|
+
"gcc -E -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -Wall -fPIC conftest.c -o conftest.i"
|
14
14
|
checked program was:
|
15
15
|
/* begin */
|
16
16
|
1: #include <string.h>
|
@@ -20,7 +20,7 @@ checked program was:
|
|
20
20
|
|
21
21
|
have_library: checking for main() in -lc... -------------------- yes
|
22
22
|
|
23
|
-
"gcc -o conftest -I. -I/
|
23
|
+
"gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -Wall -fPIC conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lruby-static -lc -lpthread -ldl -lcrypt -lm -lc"
|
24
24
|
checked program was:
|
25
25
|
/* begin */
|
26
26
|
1: /*top*/
|
@@ -32,7 +32,7 @@ checked program was:
|
|
32
32
|
|
33
33
|
have_func: checking for snprintf() in stdio.h... -------------------- yes
|
34
34
|
|
35
|
-
"gcc -o conftest -I. -I/
|
35
|
+
"gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -Wall -fPIC conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lc -lruby-static -lc -lpthread -ldl -lcrypt -lm -lc"
|
36
36
|
checked program was:
|
37
37
|
/* begin */
|
38
38
|
1: #include <stdio.h>
|
data/ext/word_tagger/mkmf.log
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
have_library: checking for main() in -lc... -------------------- yes
|
2
2
|
|
3
|
-
"gcc -o conftest -I. -I/
|
3
|
+
"gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -Wall -fPIC conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lruby-static -lc -lpthread -ldl -lcrypt -lm -lc"
|
4
4
|
checked program was:
|
5
5
|
/* begin */
|
6
6
|
1: /*top*/
|
@@ -12,7 +12,7 @@ checked program was:
|
|
12
12
|
|
13
13
|
have_library: checking for main() in -lstdc++... -------------------- yes
|
14
14
|
|
15
|
-
"gcc -o conftest -I. -I/
|
15
|
+
"gcc -o conftest -I. -I/usr/lib64/ruby/1.8/x86_64-linux -I. -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -Wall -fPIC conftest.c -L. -L/usr/lib64 -L. -rdynamic -Wl,-export-dynamic -lc -lruby-static -lstdc++ -lc -lpthread -ldl -lcrypt -lm -lc"
|
16
16
|
checked program was:
|
17
17
|
/* begin */
|
18
18
|
1: /*top*/
|
data/ext/word_tagger/tagger.cc
CHANGED
@@ -101,14 +101,15 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
|
|
101
101
|
for( short j = 1; (j <= this->nwords) && ((i+j) < words.size()); ++j ) {
|
102
102
|
matched = this->tags.find( match_word );
|
103
103
|
if( matched != this->tags.end() ){
|
104
|
-
//printf( "word: %d:(%s->%s)\n", i, match_word.c_str(), matched->second.c_str() );
|
105
104
|
std::map<std::string, int>::iterator mloc = matched_tags.find( matched->second );
|
106
105
|
if( mloc == matched_tags.end() ) {
|
107
106
|
matched_tags[matched->second] = 1; // count 1
|
107
|
+
//printf( "word: %d:(%s->%s) %d, hits: 1\n", i, match_word.c_str(), matched->second.c_str(), j );
|
108
108
|
}
|
109
109
|
else {
|
110
110
|
mloc->second++;
|
111
111
|
if( max_count < mloc->second ) { max_count = mloc->second; }
|
112
|
+
//printf( "word: %d:(%s->%s) %d, hits: %d\n", i, match_word.c_str(), matched->second.c_str(), j, mloc->second );
|
112
113
|
}
|
113
114
|
}
|
114
115
|
// stem each word and compare against our tag bank
|
@@ -129,11 +130,22 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
|
|
129
130
|
}
|
130
131
|
}
|
131
132
|
}
|
133
|
+
|
134
|
+
std::vector< std::string > reduced_tags;
|
135
|
+
|
136
|
+
// now we have a list of tags that match within the document text, check if we need to reduce the tags
|
137
|
+
if( matched_tags.size() < max ) {
|
138
|
+
// prepare the return vector
|
139
|
+
for( std::map<std::string, int>::iterator mloc = matched_tags.begin(); mloc != matched_tags.end(); ++mloc ){
|
140
|
+
reduced_tags.push_back( mloc->first );
|
141
|
+
}
|
142
|
+
return reduced_tags;
|
143
|
+
}
|
132
144
|
|
133
145
|
// now that we have all the matched tags reduce to max using the tag frequency as a reduction measure
|
134
146
|
std::vector< std::pair<std::string,int> > sorted_tags;
|
135
147
|
|
136
|
-
//printf( "max frequency: %d\n", max_count );
|
148
|
+
//printf( "max frequency: %d, total tagged: %d, reducing to %d\n", max_count, matched_tags.size(), max );
|
137
149
|
for( std::map<std::string, int>::iterator mloc = matched_tags.begin(); mloc != matched_tags.end(); ++mloc ){
|
138
150
|
//printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
|
139
151
|
sorted_tags.push_back(*mloc);
|
@@ -142,14 +154,13 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
|
|
142
154
|
// sort the tags in frequency order
|
143
155
|
std::sort( sorted_tags.begin(), sorted_tags.end(), WordComparitor() );
|
144
156
|
|
145
|
-
std::vector< std::string > reduced_tags;
|
146
157
|
|
147
158
|
std::vector< std::pair<std::string, int> >::iterator mloc;
|
148
159
|
do {
|
149
160
|
for(mloc = sorted_tags.begin(); mloc != sorted_tags.end(); ++mloc ) {
|
150
161
|
std::pair< std::string, int > word_freq = *mloc;
|
151
|
-
|
152
|
-
|
162
|
+
printf( "word: %s, frequency: %d\n", word_freq.first.c_str(), word_freq.second );
|
163
|
+
printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
|
153
164
|
if( word_freq.second < max_count ) {
|
154
165
|
sorted_tags.erase( mloc );
|
155
166
|
break;
|
data/lib/rbtagger/version.rb
CHANGED
data/test/test_rule_tagger.rb
CHANGED
@@ -32,9 +32,6 @@ The details of her visitation, however, are unclear.
|
|
32
32
|
Asked by Us if she were happy with the court outcome, Spears (clutching an Ed Hardy purse) smiled and told Us, "Yes."
|
33
33
|
Next up: A status hearing set for July 15.
|
34
34
|
The couple last appeared in court May 6. Spears was granted extended visitation — three days a week from 9 a.m. to 5 p.m. — of Sean Preston, 2, and Jayden James, 20 months.
|
35
|
-
)
|
36
|
-
SAMPLE_DOC3=%q(
|
37
|
-
TMZ.com: Britney celebrated getting overnights with her kids by going on a wild shopping trip for herself.With L.A.'s finest at her service, it was a total clusterf**k outside of Fred Segal as Brit Brit made her way out. The scene was crazy -- and it was all... Read more
|
38
35
|
)
|
39
36
|
def setup
|
40
37
|
if !defined?($tagger)
|
@@ -137,8 +134,6 @@ TMZ.com: Britney celebrated getting overnights with her kids by going on a wild
|
|
137
134
|
assert results.include?(["Britney Spears", "NNP", 6])
|
138
135
|
assert results.include?(["Jamie Spears", "NNP", 12])
|
139
136
|
# puts results.inspect
|
140
|
-
results = tagger.suggest( SAMPLE_DOC3, 5 )
|
141
|
-
puts results.inspect
|
142
137
|
end
|
143
138
|
|
144
139
|
private
|
data/test/test_word_tagger.rb
CHANGED
@@ -16,6 +16,14 @@ class TestWordTagger < Test::Unit::TestCase
|
|
16
16
|
puts "Duration: #{Time.now - timer} sec"
|
17
17
|
end
|
18
18
|
|
19
|
+
def test_sample_bug
|
20
|
+
tags = ["foo", "bar", "baz", "squishy", "yummy"]
|
21
|
+
txt = 'This is some sample text. Foo walked into a bar. The bartender said "What can I get you?" Foo said he wanted something yummy - like a baz.'
|
22
|
+
tagger = Word::Tagger.new tags, :words => 4
|
23
|
+
result_tags = tagger.execute( txt )
|
24
|
+
assert_equal ["bar", "baz", "foo", "yummy"], result_tags
|
25
|
+
end
|
26
|
+
|
19
27
|
def test_ngram_size3
|
20
28
|
timer = Time.now
|
21
29
|
text = "This body of text contains something like ventricular septal defect"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbtagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.8
|
4
|
+
version: 0.2.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Todd A. Fisher
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-
|
12
|
+
date: 2008-08-28 00:00:00 -04:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -28,8 +28,8 @@ email:
|
|
28
28
|
executables: []
|
29
29
|
|
30
30
|
extensions:
|
31
|
-
- ext/rule_tagger/extconf.rb
|
32
31
|
- ext/word_tagger/extconf.rb
|
32
|
+
- ext/rule_tagger/extconf.rb
|
33
33
|
extra_rdoc_files:
|
34
34
|
- History.txt
|
35
35
|
- License.txt
|
@@ -142,8 +142,8 @@ rdoc_options:
|
|
142
142
|
- README.txt
|
143
143
|
require_paths:
|
144
144
|
- lib
|
145
|
-
- ext/rule_tagger
|
146
145
|
- ext/word_tagger
|
146
|
+
- ext/rule_tagger
|
147
147
|
required_ruby_version: !ruby/object:Gem::Requirement
|
148
148
|
requirements:
|
149
149
|
- - ">="
|
@@ -164,6 +164,6 @@ signing_key:
|
|
164
164
|
specification_version: 2
|
165
165
|
summary: A Simple Ruby Rule-Based Part of Speech Tagger
|
166
166
|
test_files:
|
167
|
-
- test/test_helper.rb
|
168
167
|
- test/test_rule_tagger.rb
|
168
|
+
- test/test_helper.rb
|
169
169
|
- test/test_word_tagger.rb
|