rbtagger 0.0.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Manifest.txt +9 -2
- data/Rakefile +0 -29
- data/config/hoe.rb +2 -2
- data/ext/rule_tagger/mkmf.log +46 -0
- data/ext/word_tagger/mkmf.log +24 -0
- data/ext/word_tagger/rtagger.cc +2 -2
- data/ext/word_tagger/tagger.cc +14 -4
- data/lib/rbtagger/version.rb +2 -2
- data/lib/rbtagger.rb +8 -1
- data/lib/word/tagger.rb +18 -0
- data/script/txt2html +1 -1
- data/tasks/extconf/rule_tagger.rake +43 -0
- data/tasks/extconf/word_tagger.rake +43 -0
- data/tasks/extconf.rake +18 -0
- data/test/fixtures/tags.txt +976 -0
- data/test/{tagger_test.rb → test_rule_tagger.rb} +15 -14
- data/test/test_word_tagger.rb +33 -0
- data/website/index.html +12 -2
- data/website/index.txt +11 -1
- metadata +16 -6
- data/ext/word_tagger/tagger.rb +0 -8
data/Manifest.txt
CHANGED
@@ -18,6 +18,7 @@ ext/rule_tagger/lex.c
|
|
18
18
|
ext/rule_tagger/lex.h
|
19
19
|
ext/rule_tagger/memory.c
|
20
20
|
ext/rule_tagger/memory.h
|
21
|
+
ext/rule_tagger/mkmf.log
|
21
22
|
ext/rule_tagger/rbtagger.c
|
22
23
|
ext/rule_tagger/registry.c
|
23
24
|
ext/rule_tagger/registry.h
|
@@ -31,12 +32,12 @@ ext/rule_tagger/tagger.h
|
|
31
32
|
ext/rule_tagger/useful.c
|
32
33
|
ext/rule_tagger/useful.h
|
33
34
|
ext/word_tagger/extconf.rb
|
35
|
+
ext/word_tagger/mkmf.log
|
34
36
|
ext/word_tagger/porter_stemmer.c
|
35
37
|
ext/word_tagger/porter_stemmer.h
|
36
38
|
ext/word_tagger/rtagger.cc
|
37
39
|
ext/word_tagger/tagger.cc
|
38
40
|
ext/word_tagger/tagger.h
|
39
|
-
ext/word_tagger/tagger.rb
|
40
41
|
ext/word_tagger/test.rb
|
41
42
|
ext/word_tagger/test/Makefile
|
42
43
|
ext/word_tagger/test/doc.txt
|
@@ -44,6 +45,7 @@ ext/word_tagger/test/test.cc
|
|
44
45
|
lib/brill/tagger.rb
|
45
46
|
lib/rbtagger.rb
|
46
47
|
lib/rbtagger/version.rb
|
48
|
+
lib/word/tagger.rb
|
47
49
|
script/console
|
48
50
|
script/destroy
|
49
51
|
script/generate
|
@@ -51,6 +53,9 @@ script/txt2html
|
|
51
53
|
setup.rb
|
52
54
|
tasks/deployment.rake
|
53
55
|
tasks/environment.rake
|
56
|
+
tasks/extconf.rake
|
57
|
+
tasks/extconf/rule_tagger.rake
|
58
|
+
tasks/extconf/word_tagger.rake
|
54
59
|
tasks/website.rake
|
55
60
|
test/CONTEXTUALRULEFILE
|
56
61
|
test/LEXICALRULEFILE
|
@@ -65,8 +70,10 @@ test/docs/doc6.txt
|
|
65
70
|
test/docs/doc7.txt
|
66
71
|
test/docs/doc8.txt
|
67
72
|
test/docs/doc9.txt
|
68
|
-
test/
|
73
|
+
test/fixtures/tags.txt
|
69
74
|
test/test_helper.rb
|
75
|
+
test/test_rule_tagger.rb
|
76
|
+
test/test_word_tagger.rb
|
70
77
|
tools/rakehelp.rb
|
71
78
|
website/index.html
|
72
79
|
website/index.txt
|
data/Rakefile
CHANGED
@@ -2,32 +2,3 @@ require 'config/requirements'
|
|
2
2
|
require 'config/hoe' # setup Hoe + all gem configuration
|
3
3
|
|
4
4
|
Dir['tasks/**/*.rake'].each { |rake| load rake }
|
5
|
-
|
6
|
-
# redefine release
|
7
|
-
|
8
|
-
desc 'Package and upload the release to rubyforge.'
|
9
|
-
task :release_current => [:clean, :package] do |t|
|
10
|
-
require 'config/hoe'
|
11
|
-
version = ENV["VERSION"] or abort "Must supply VERSION=x.y.z"
|
12
|
-
name = $hoe.name
|
13
|
-
rubyforge_name = $hoe.rubyforge_name
|
14
|
-
description = $hoe.description
|
15
|
-
pkg = "pkg/#{name}-#{version}"
|
16
|
-
abort "Package doesn't exist => #{pkg}" if !File.exist?(pkg)
|
17
|
-
|
18
|
-
rf = RubyForge.new
|
19
|
-
puts "Logging in"
|
20
|
-
rf.login
|
21
|
-
|
22
|
-
c = rf.userconfig
|
23
|
-
c["release_notes"] = description if description
|
24
|
-
c["release_changes"] = $hoe.changes if $hoe.changes
|
25
|
-
c["preformatted"] = true
|
26
|
-
|
27
|
-
files = [(@need_tar ? "#{pkg}.tgz" : nil),
|
28
|
-
(@need_zip ? "#{pkg}.zip" : nil),
|
29
|
-
"#{pkg}.gem"].compact
|
30
|
-
|
31
|
-
puts "Releasing #{rubyforge_name} v. #{version}"
|
32
|
-
rf.add_release 'ruletagger', 'ruletagger', version, *files
|
33
|
-
end
|
data/config/hoe.rb
CHANGED
@@ -4,7 +4,7 @@ AUTHOR = 'Todd A. Fisher' # can also be an array of Authors
|
|
4
4
|
EMAIL = 'todd.fisher@gmail.com'
|
5
5
|
DESCRIPTION = "A Simple Ruby Rule-Based Part of Speech Tagger"
|
6
6
|
GEM_NAME = 'rbtagger' # what ppl will type to install your gem
|
7
|
-
RUBYFORGE_PROJECT = '
|
7
|
+
RUBYFORGE_PROJECT = 'rbtagger' # The unix name for your project
|
8
8
|
HOMEPATH = "http://#{RUBYFORGE_PROJECT}.rubyforge.org"
|
9
9
|
DOWNLOAD_PATH = "http://rubyforge.org/projects/#{RUBYFORGE_PROJECT}"
|
10
10
|
EXTRA_DEPENDENCIES = [
|
@@ -57,7 +57,7 @@ $hoe = Hoe.new(GEM_NAME, VERS) do |p|
|
|
57
57
|
p.description = DESCRIPTION
|
58
58
|
p.summary = DESCRIPTION
|
59
59
|
p.url = HOMEPATH
|
60
|
-
p.rubyforge_name =
|
60
|
+
p.rubyforge_name = RUBYFORGE_PROJECT if RUBYFORGE_PROJECT
|
61
61
|
p.test_globs = ["test/**/test_*.rb"]
|
62
62
|
p.clean_globs |= ['**/.*.sw?', '*.gem', '.config', '**/.DS_Store'] #An array of file patterns to delete on clean.
|
63
63
|
|
@@ -0,0 +1,46 @@
|
|
1
|
+
have_header: checking for stdlib.h... -------------------- yes
|
2
|
+
|
3
|
+
"gcc -E -I. -I/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/universal-darwin9.0 -I. -Os -pipe -fno-common conftest.c -o conftest.i"
|
4
|
+
checked program was:
|
5
|
+
/* begin */
|
6
|
+
1: #include <stdlib.h>
|
7
|
+
/* end */
|
8
|
+
|
9
|
+
--------------------
|
10
|
+
|
11
|
+
have_header: checking for string.h... -------------------- yes
|
12
|
+
|
13
|
+
"gcc -E -I. -I/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/universal-darwin9.0 -I. -Os -pipe -fno-common conftest.c -o conftest.i"
|
14
|
+
checked program was:
|
15
|
+
/* begin */
|
16
|
+
1: #include <string.h>
|
17
|
+
/* end */
|
18
|
+
|
19
|
+
--------------------
|
20
|
+
|
21
|
+
have_library: checking for main() in -lc... -------------------- yes
|
22
|
+
|
23
|
+
"gcc -o conftest -I. -I/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/universal-darwin9.0 -I. -arch ppc -arch i386 -Os -pipe -fno-common conftest.c -L"." -L"/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib" -L. -arch ppc -arch i386 -lruby -lc -lpthread -ldl -lm "
|
24
|
+
checked program was:
|
25
|
+
/* begin */
|
26
|
+
1: /*top*/
|
27
|
+
2: int main() { return 0; }
|
28
|
+
3: int t() { void ((*volatile p)()); p = (void ((*)()))main; return 0; }
|
29
|
+
/* end */
|
30
|
+
|
31
|
+
--------------------
|
32
|
+
|
33
|
+
have_func: checking for snprintf() in stdio.h... -------------------- yes
|
34
|
+
|
35
|
+
"gcc -o conftest -I. -I/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/universal-darwin9.0 -I. -arch ppc -arch i386 -Os -pipe -fno-common conftest.c -L"." -L"/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib" -L. -arch ppc -arch i386 -lc -lruby -lc -lpthread -ldl -lm "
|
36
|
+
checked program was:
|
37
|
+
/* begin */
|
38
|
+
1: #include <stdio.h>
|
39
|
+
2:
|
40
|
+
3: /*top*/
|
41
|
+
4: int main() { return 0; }
|
42
|
+
5: int t() { void ((*volatile p)()); p = (void ((*)()))snprintf; return 0; }
|
43
|
+
/* end */
|
44
|
+
|
45
|
+
--------------------
|
46
|
+
|
@@ -0,0 +1,24 @@
|
|
1
|
+
have_library: checking for main() in -lc... -------------------- yes
|
2
|
+
|
3
|
+
"gcc -o conftest -I. -I/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/universal-darwin9.0 -I. -arch ppc -arch i386 -Os -pipe -fno-common conftest.c -L"." -L"/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib" -L. -arch ppc -arch i386 -lruby -lc -lpthread -ldl -lm "
|
4
|
+
checked program was:
|
5
|
+
/* begin */
|
6
|
+
1: /*top*/
|
7
|
+
2: int main() { return 0; }
|
8
|
+
3: int t() { void ((*volatile p)()); p = (void ((*)()))main; return 0; }
|
9
|
+
/* end */
|
10
|
+
|
11
|
+
--------------------
|
12
|
+
|
13
|
+
have_library: checking for main() in -lstdc++... -------------------- yes
|
14
|
+
|
15
|
+
"gcc -o conftest -I. -I/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/universal-darwin9.0 -I. -arch ppc -arch i386 -Os -pipe -fno-common conftest.c -L"." -L"/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib" -L. -arch ppc -arch i386 -lc -lruby -lstdc++ -lc -lpthread -ldl -lm "
|
16
|
+
checked program was:
|
17
|
+
/* begin */
|
18
|
+
1: /*top*/
|
19
|
+
2: int main() { return 0; }
|
20
|
+
3: int t() { void ((*volatile p)()); p = (void ((*)()))main; return 0; }
|
21
|
+
/* end */
|
22
|
+
|
23
|
+
--------------------
|
24
|
+
|
data/ext/word_tagger/rtagger.cc
CHANGED
@@ -70,10 +70,10 @@ VALUE Tagger_alloc(VALUE klass)
|
|
70
70
|
return object;
|
71
71
|
}
|
72
72
|
|
73
|
-
extern "C" void
|
73
|
+
extern "C" void Init_word_tagger()
|
74
74
|
{
|
75
75
|
rb_Tagger = rb_define_module( "Tagger" );
|
76
|
-
rb_NWordTagger = rb_define_class_under( rb_Tagger, "
|
76
|
+
rb_NWordTagger = rb_define_class_under( rb_Tagger, "WordTagger", rb_cObject );
|
77
77
|
|
78
78
|
rb_define_alloc_func( rb_NWordTagger, Tagger_alloc );
|
79
79
|
|
data/ext/word_tagger/tagger.cc
CHANGED
@@ -33,12 +33,21 @@ static std::vector<std::string> word_split(const std::string& s)
|
|
33
33
|
return std::vector<std::string>(std::istream_iterator<std::string>(is), std::istream_iterator<std::string>());
|
34
34
|
}
|
35
35
|
|
36
|
+
static void word_downcase( std::string &word )
|
37
|
+
{
|
38
|
+
for( int j = 0; j < word.size(); ++j ) {
|
39
|
+
word[j] = tolower( word[j] );
|
40
|
+
}
|
41
|
+
}
|
42
|
+
|
36
43
|
NWordTagger::NWordTagger()
|
37
44
|
: nwords(2), stemmer(porter_stemmer_new()){
|
38
45
|
}
|
39
46
|
NWordTagger::~NWordTagger(){
|
40
47
|
porter_stemmer_free(stemmer);
|
41
48
|
}
|
49
|
+
|
50
|
+
|
42
51
|
void NWordTagger::loadTags( const std::set<std::string> &tags )
|
43
52
|
{
|
44
53
|
for( std::set<std::string>::iterator i = tags.begin(); i != tags.end(); ++i ){
|
@@ -51,14 +60,14 @@ void NWordTagger::loadTags( const std::set<std::string> &tags )
|
|
51
60
|
stemmed += this->stemWord(words[j]) + " ";
|
52
61
|
}
|
53
62
|
stemmed = stemmed.substr(0,stemmed.length()-1);
|
54
|
-
this->tags[stemmed] = word;
|
55
|
-
//printf( "word: %s -> %s\n", word.c_str(), stemmed.c_str() );
|
56
63
|
}
|
57
64
|
else{
|
58
65
|
stemmed = this->stemWord(*i);
|
59
|
-
//printf( "word: %s -> %s\n", word.c_str(), stemmed.c_str() );
|
60
|
-
this->tags[stemmed] = word;
|
61
66
|
}
|
67
|
+
// downcase stemmed
|
68
|
+
word_downcase( stemmed );
|
69
|
+
//printf( "word: %s -> %s\n", word.c_str(), stemmed.c_str() );
|
70
|
+
this->tags[stemmed] = word;
|
62
71
|
|
63
72
|
}
|
64
73
|
}
|
@@ -84,6 +93,7 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
|
|
84
93
|
|
85
94
|
// get the stemmed word at position i
|
86
95
|
match_word = this->stemWord(words[i]);
|
96
|
+
word_downcase( match_word );
|
87
97
|
|
88
98
|
// now scan ahead nwords positions searching our tags table for matches
|
89
99
|
for( short j = 1; (j <= this->nwords) && ((i+j) < words.size()); ++j ) {
|
data/lib/rbtagger/version.rb
CHANGED
data/lib/rbtagger.rb
CHANGED
@@ -2,5 +2,12 @@ $:.unshift(File.dirname(__FILE__)) unless
|
|
2
2
|
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
3
3
|
|
4
4
|
module RbTagger
|
5
|
-
|
5
|
+
class << self
|
6
|
+
def tags_from_file( file )
|
7
|
+
File.read(file).split("\n").map{|t| t.strip}
|
8
|
+
end
|
9
|
+
end
|
6
10
|
end
|
11
|
+
|
12
|
+
require 'word/tagger'
|
13
|
+
require 'brill/tagger'
|
data/lib/word/tagger.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
module Word
|
2
|
+
require 'word_tagger'
|
3
|
+
class Tagger < Tagger::WordTagger
|
4
|
+
def initialize( tags, options = {} )
|
5
|
+
if tags.is_a?(String) and File.exist?(tags)
|
6
|
+
load_tags( RbTagger.tags_from_file( tags ) )
|
7
|
+
else
|
8
|
+
load_tags( tags )
|
9
|
+
end
|
10
|
+
set_words( options[:words] || 2 )
|
11
|
+
end
|
12
|
+
|
13
|
+
def execute( text )
|
14
|
+
# strip non alpha characters
|
15
|
+
super( text.gsub(/[^\w]/,' ') )
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
data/script/txt2html
CHANGED
@@ -0,0 +1,43 @@
|
|
1
|
+
namespace :extconf do
|
2
|
+
extension = File.basename(__FILE__, '.rake')
|
3
|
+
|
4
|
+
ext = "ext/#{extension}"
|
5
|
+
ext_so = "#{ext}/#{extension}.#{Config::CONFIG['DLEXT']}"
|
6
|
+
ext_files = FileList[
|
7
|
+
"#{ext}/*.c",
|
8
|
+
"#{ext}/*.h",
|
9
|
+
"#{ext}/*.rl",
|
10
|
+
"#{ext}/extconf.rb",
|
11
|
+
"#{ext}/Makefile",
|
12
|
+
# "lib"
|
13
|
+
]
|
14
|
+
|
15
|
+
|
16
|
+
task :compile => extension do
|
17
|
+
if Dir.glob("**/#{extension}.{o,so,dll,bundle}").length == 0
|
18
|
+
STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
|
19
|
+
STDERR.puts "Gem actually failed to build. Your system is"
|
20
|
+
STDERR.puts "NOT configured properly to build #{GEM_NAME}."
|
21
|
+
STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
|
22
|
+
exit(1)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
desc "Builds just the #{extension} extension"
|
27
|
+
task extension.to_sym => ["#{ext}/Makefile", ext_so ]
|
28
|
+
|
29
|
+
file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
|
30
|
+
Dir.chdir(ext) do ruby "extconf.rb" end
|
31
|
+
end
|
32
|
+
|
33
|
+
file ext_so => ext_files do
|
34
|
+
Dir.chdir(ext) do
|
35
|
+
sh(PLATFORM =~ /win32/ ? 'nmake' : 'make') do |ok, res|
|
36
|
+
if !ok
|
37
|
+
require "fileutils"
|
38
|
+
FileUtils.rm Dir.glob('*.{so,o,dll,bundle}')
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
namespace :extconf do
|
2
|
+
extension = File.basename(__FILE__, '.rake')
|
3
|
+
|
4
|
+
ext = "ext/#{extension}"
|
5
|
+
ext_so = "#{ext}/#{extension}.#{Config::CONFIG['DLEXT']}"
|
6
|
+
ext_files = FileList[
|
7
|
+
"#{ext}/*.c",
|
8
|
+
"#{ext}/*.h",
|
9
|
+
"#{ext}/*.rl",
|
10
|
+
"#{ext}/extconf.rb",
|
11
|
+
"#{ext}/Makefile",
|
12
|
+
# "lib"
|
13
|
+
]
|
14
|
+
|
15
|
+
|
16
|
+
task :compile => extension do
|
17
|
+
if Dir.glob("**/#{extension}.{o,so,dll,bundle}").length == 0
|
18
|
+
STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
|
19
|
+
STDERR.puts "Gem actually failed to build. Your system is"
|
20
|
+
STDERR.puts "NOT configured properly to build #{GEM_NAME}."
|
21
|
+
STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
|
22
|
+
exit(1)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
desc "Builds just the #{extension} extension"
|
27
|
+
task extension.to_sym => ["#{ext}/Makefile", ext_so ]
|
28
|
+
|
29
|
+
file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
|
30
|
+
Dir.chdir(ext) do ruby "extconf.rb" end
|
31
|
+
end
|
32
|
+
|
33
|
+
file ext_so => ext_files do
|
34
|
+
Dir.chdir(ext) do
|
35
|
+
sh(PLATFORM =~ /win32/ ? 'nmake' : 'make') do |ok, res|
|
36
|
+
if !ok
|
37
|
+
require "fileutils"
|
38
|
+
FileUtils.rm Dir.glob('*.{so,o,dll,bundle}')
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
data/tasks/extconf.rake
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
namespace :extconf do
|
2
|
+
desc "Compiles the Ruby extension"
|
3
|
+
task :compile
|
4
|
+
end
|
5
|
+
|
6
|
+
BIN = "*.{bundle,jar,so,obj,pdb,lib,def,exp}"
|
7
|
+
|
8
|
+
task :compile => "extconf:compile" do
|
9
|
+
Dir["ext/**/*.{bundle,so,dll}"].each do|lib|
|
10
|
+
sh "cp #{lib} lib/"
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
task :test => :compile
|
15
|
+
|
16
|
+
$hoe.clean_globs |= ["ext/**/#{BIN}", "lib/**/#{BIN}", 'ext/word_tagger/Makefile', 'ext/rule_tagger/Makefile']
|
17
|
+
$hoe.spec.require_paths = Dir['{lib,ext/*}']
|
18
|
+
$hoe.spec.extensions = FileList["ext/**/extconf.rb"].to_a
|