rbtagger 0.0.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manifest.txt +9 -2
- data/Rakefile +0 -29
- data/config/hoe.rb +2 -2
- data/ext/rule_tagger/mkmf.log +46 -0
- data/ext/word_tagger/mkmf.log +24 -0
- data/ext/word_tagger/rtagger.cc +2 -2
- data/ext/word_tagger/tagger.cc +14 -4
- data/lib/rbtagger/version.rb +2 -2
- data/lib/rbtagger.rb +8 -1
- data/lib/word/tagger.rb +18 -0
- data/script/txt2html +1 -1
- data/tasks/extconf/rule_tagger.rake +43 -0
- data/tasks/extconf/word_tagger.rake +43 -0
- data/tasks/extconf.rake +18 -0
- data/test/fixtures/tags.txt +976 -0
- data/test/{tagger_test.rb → test_rule_tagger.rb} +15 -14
- data/test/test_word_tagger.rb +33 -0
- data/website/index.html +12 -2
- data/website/index.txt +11 -1
- metadata +16 -6
- data/ext/word_tagger/tagger.rb +0 -8
data/Manifest.txt
CHANGED
@@ -18,6 +18,7 @@ ext/rule_tagger/lex.c
|
|
18
18
|
ext/rule_tagger/lex.h
|
19
19
|
ext/rule_tagger/memory.c
|
20
20
|
ext/rule_tagger/memory.h
|
21
|
+
ext/rule_tagger/mkmf.log
|
21
22
|
ext/rule_tagger/rbtagger.c
|
22
23
|
ext/rule_tagger/registry.c
|
23
24
|
ext/rule_tagger/registry.h
|
@@ -31,12 +32,12 @@ ext/rule_tagger/tagger.h
|
|
31
32
|
ext/rule_tagger/useful.c
|
32
33
|
ext/rule_tagger/useful.h
|
33
34
|
ext/word_tagger/extconf.rb
|
35
|
+
ext/word_tagger/mkmf.log
|
34
36
|
ext/word_tagger/porter_stemmer.c
|
35
37
|
ext/word_tagger/porter_stemmer.h
|
36
38
|
ext/word_tagger/rtagger.cc
|
37
39
|
ext/word_tagger/tagger.cc
|
38
40
|
ext/word_tagger/tagger.h
|
39
|
-
ext/word_tagger/tagger.rb
|
40
41
|
ext/word_tagger/test.rb
|
41
42
|
ext/word_tagger/test/Makefile
|
42
43
|
ext/word_tagger/test/doc.txt
|
@@ -44,6 +45,7 @@ ext/word_tagger/test/test.cc
|
|
44
45
|
lib/brill/tagger.rb
|
45
46
|
lib/rbtagger.rb
|
46
47
|
lib/rbtagger/version.rb
|
48
|
+
lib/word/tagger.rb
|
47
49
|
script/console
|
48
50
|
script/destroy
|
49
51
|
script/generate
|
@@ -51,6 +53,9 @@ script/txt2html
|
|
51
53
|
setup.rb
|
52
54
|
tasks/deployment.rake
|
53
55
|
tasks/environment.rake
|
56
|
+
tasks/extconf.rake
|
57
|
+
tasks/extconf/rule_tagger.rake
|
58
|
+
tasks/extconf/word_tagger.rake
|
54
59
|
tasks/website.rake
|
55
60
|
test/CONTEXTUALRULEFILE
|
56
61
|
test/LEXICALRULEFILE
|
@@ -65,8 +70,10 @@ test/docs/doc6.txt
|
|
65
70
|
test/docs/doc7.txt
|
66
71
|
test/docs/doc8.txt
|
67
72
|
test/docs/doc9.txt
|
68
|
-
test/
|
73
|
+
test/fixtures/tags.txt
|
69
74
|
test/test_helper.rb
|
75
|
+
test/test_rule_tagger.rb
|
76
|
+
test/test_word_tagger.rb
|
70
77
|
tools/rakehelp.rb
|
71
78
|
website/index.html
|
72
79
|
website/index.txt
|
data/Rakefile
CHANGED
@@ -2,32 +2,3 @@ require 'config/requirements'
|
|
2
2
|
require 'config/hoe' # setup Hoe + all gem configuration
|
3
3
|
|
4
4
|
Dir['tasks/**/*.rake'].each { |rake| load rake }
|
5
|
-
|
6
|
-
# redefine release
|
7
|
-
|
8
|
-
desc 'Package and upload the release to rubyforge.'
|
9
|
-
task :release_current => [:clean, :package] do |t|
|
10
|
-
require 'config/hoe'
|
11
|
-
version = ENV["VERSION"] or abort "Must supply VERSION=x.y.z"
|
12
|
-
name = $hoe.name
|
13
|
-
rubyforge_name = $hoe.rubyforge_name
|
14
|
-
description = $hoe.description
|
15
|
-
pkg = "pkg/#{name}-#{version}"
|
16
|
-
abort "Package doesn't exist => #{pkg}" if !File.exist?(pkg)
|
17
|
-
|
18
|
-
rf = RubyForge.new
|
19
|
-
puts "Logging in"
|
20
|
-
rf.login
|
21
|
-
|
22
|
-
c = rf.userconfig
|
23
|
-
c["release_notes"] = description if description
|
24
|
-
c["release_changes"] = $hoe.changes if $hoe.changes
|
25
|
-
c["preformatted"] = true
|
26
|
-
|
27
|
-
files = [(@need_tar ? "#{pkg}.tgz" : nil),
|
28
|
-
(@need_zip ? "#{pkg}.zip" : nil),
|
29
|
-
"#{pkg}.gem"].compact
|
30
|
-
|
31
|
-
puts "Releasing #{rubyforge_name} v. #{version}"
|
32
|
-
rf.add_release 'ruletagger', 'ruletagger', version, *files
|
33
|
-
end
|
data/config/hoe.rb
CHANGED
@@ -4,7 +4,7 @@ AUTHOR = 'Todd A. Fisher' # can also be an array of Authors
|
|
4
4
|
EMAIL = 'todd.fisher@gmail.com'
|
5
5
|
DESCRIPTION = "A Simple Ruby Rule-Based Part of Speech Tagger"
|
6
6
|
GEM_NAME = 'rbtagger' # what ppl will type to install your gem
|
7
|
-
RUBYFORGE_PROJECT = '
|
7
|
+
RUBYFORGE_PROJECT = 'rbtagger' # The unix name for your project
|
8
8
|
HOMEPATH = "http://#{RUBYFORGE_PROJECT}.rubyforge.org"
|
9
9
|
DOWNLOAD_PATH = "http://rubyforge.org/projects/#{RUBYFORGE_PROJECT}"
|
10
10
|
EXTRA_DEPENDENCIES = [
|
@@ -57,7 +57,7 @@ $hoe = Hoe.new(GEM_NAME, VERS) do |p|
|
|
57
57
|
p.description = DESCRIPTION
|
58
58
|
p.summary = DESCRIPTION
|
59
59
|
p.url = HOMEPATH
|
60
|
-
p.rubyforge_name =
|
60
|
+
p.rubyforge_name = RUBYFORGE_PROJECT if RUBYFORGE_PROJECT
|
61
61
|
p.test_globs = ["test/**/test_*.rb"]
|
62
62
|
p.clean_globs |= ['**/.*.sw?', '*.gem', '.config', '**/.DS_Store'] #An array of file patterns to delete on clean.
|
63
63
|
|
@@ -0,0 +1,46 @@
|
|
1
|
+
have_header: checking for stdlib.h... -------------------- yes
|
2
|
+
|
3
|
+
"gcc -E -I. -I/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/universal-darwin9.0 -I. -Os -pipe -fno-common conftest.c -o conftest.i"
|
4
|
+
checked program was:
|
5
|
+
/* begin */
|
6
|
+
1: #include <stdlib.h>
|
7
|
+
/* end */
|
8
|
+
|
9
|
+
--------------------
|
10
|
+
|
11
|
+
have_header: checking for string.h... -------------------- yes
|
12
|
+
|
13
|
+
"gcc -E -I. -I/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/universal-darwin9.0 -I. -Os -pipe -fno-common conftest.c -o conftest.i"
|
14
|
+
checked program was:
|
15
|
+
/* begin */
|
16
|
+
1: #include <string.h>
|
17
|
+
/* end */
|
18
|
+
|
19
|
+
--------------------
|
20
|
+
|
21
|
+
have_library: checking for main() in -lc... -------------------- yes
|
22
|
+
|
23
|
+
"gcc -o conftest -I. -I/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/universal-darwin9.0 -I. -arch ppc -arch i386 -Os -pipe -fno-common conftest.c -L"." -L"/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib" -L. -arch ppc -arch i386 -lruby -lc -lpthread -ldl -lm "
|
24
|
+
checked program was:
|
25
|
+
/* begin */
|
26
|
+
1: /*top*/
|
27
|
+
2: int main() { return 0; }
|
28
|
+
3: int t() { void ((*volatile p)()); p = (void ((*)()))main; return 0; }
|
29
|
+
/* end */
|
30
|
+
|
31
|
+
--------------------
|
32
|
+
|
33
|
+
have_func: checking for snprintf() in stdio.h... -------------------- yes
|
34
|
+
|
35
|
+
"gcc -o conftest -I. -I/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/universal-darwin9.0 -I. -arch ppc -arch i386 -Os -pipe -fno-common conftest.c -L"." -L"/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib" -L. -arch ppc -arch i386 -lc -lruby -lc -lpthread -ldl -lm "
|
36
|
+
checked program was:
|
37
|
+
/* begin */
|
38
|
+
1: #include <stdio.h>
|
39
|
+
2:
|
40
|
+
3: /*top*/
|
41
|
+
4: int main() { return 0; }
|
42
|
+
5: int t() { void ((*volatile p)()); p = (void ((*)()))snprintf; return 0; }
|
43
|
+
/* end */
|
44
|
+
|
45
|
+
--------------------
|
46
|
+
|
@@ -0,0 +1,24 @@
|
|
1
|
+
have_library: checking for main() in -lc... -------------------- yes
|
2
|
+
|
3
|
+
"gcc -o conftest -I. -I/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/universal-darwin9.0 -I. -arch ppc -arch i386 -Os -pipe -fno-common conftest.c -L"." -L"/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib" -L. -arch ppc -arch i386 -lruby -lc -lpthread -ldl -lm "
|
4
|
+
checked program was:
|
5
|
+
/* begin */
|
6
|
+
1: /*top*/
|
7
|
+
2: int main() { return 0; }
|
8
|
+
3: int t() { void ((*volatile p)()); p = (void ((*)()))main; return 0; }
|
9
|
+
/* end */
|
10
|
+
|
11
|
+
--------------------
|
12
|
+
|
13
|
+
have_library: checking for main() in -lstdc++... -------------------- yes
|
14
|
+
|
15
|
+
"gcc -o conftest -I. -I/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/universal-darwin9.0 -I. -arch ppc -arch i386 -Os -pipe -fno-common conftest.c -L"." -L"/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib" -L. -arch ppc -arch i386 -lc -lruby -lstdc++ -lc -lpthread -ldl -lm "
|
16
|
+
checked program was:
|
17
|
+
/* begin */
|
18
|
+
1: /*top*/
|
19
|
+
2: int main() { return 0; }
|
20
|
+
3: int t() { void ((*volatile p)()); p = (void ((*)()))main; return 0; }
|
21
|
+
/* end */
|
22
|
+
|
23
|
+
--------------------
|
24
|
+
|
data/ext/word_tagger/rtagger.cc
CHANGED
@@ -70,10 +70,10 @@ VALUE Tagger_alloc(VALUE klass)
|
|
70
70
|
return object;
|
71
71
|
}
|
72
72
|
|
73
|
-
extern "C" void
|
73
|
+
extern "C" void Init_word_tagger()
|
74
74
|
{
|
75
75
|
rb_Tagger = rb_define_module( "Tagger" );
|
76
|
-
rb_NWordTagger = rb_define_class_under( rb_Tagger, "
|
76
|
+
rb_NWordTagger = rb_define_class_under( rb_Tagger, "WordTagger", rb_cObject );
|
77
77
|
|
78
78
|
rb_define_alloc_func( rb_NWordTagger, Tagger_alloc );
|
79
79
|
|
data/ext/word_tagger/tagger.cc
CHANGED
@@ -33,12 +33,21 @@ static std::vector<std::string> word_split(const std::string& s)
|
|
33
33
|
return std::vector<std::string>(std::istream_iterator<std::string>(is), std::istream_iterator<std::string>());
|
34
34
|
}
|
35
35
|
|
36
|
+
static void word_downcase( std::string &word )
|
37
|
+
{
|
38
|
+
for( int j = 0; j < word.size(); ++j ) {
|
39
|
+
word[j] = tolower( word[j] );
|
40
|
+
}
|
41
|
+
}
|
42
|
+
|
36
43
|
NWordTagger::NWordTagger()
|
37
44
|
: nwords(2), stemmer(porter_stemmer_new()){
|
38
45
|
}
|
39
46
|
NWordTagger::~NWordTagger(){
|
40
47
|
porter_stemmer_free(stemmer);
|
41
48
|
}
|
49
|
+
|
50
|
+
|
42
51
|
void NWordTagger::loadTags( const std::set<std::string> &tags )
|
43
52
|
{
|
44
53
|
for( std::set<std::string>::iterator i = tags.begin(); i != tags.end(); ++i ){
|
@@ -51,14 +60,14 @@ void NWordTagger::loadTags( const std::set<std::string> &tags )
|
|
51
60
|
stemmed += this->stemWord(words[j]) + " ";
|
52
61
|
}
|
53
62
|
stemmed = stemmed.substr(0,stemmed.length()-1);
|
54
|
-
this->tags[stemmed] = word;
|
55
|
-
//printf( "word: %s -> %s\n", word.c_str(), stemmed.c_str() );
|
56
63
|
}
|
57
64
|
else{
|
58
65
|
stemmed = this->stemWord(*i);
|
59
|
-
//printf( "word: %s -> %s\n", word.c_str(), stemmed.c_str() );
|
60
|
-
this->tags[stemmed] = word;
|
61
66
|
}
|
67
|
+
// downcase stemmed
|
68
|
+
word_downcase( stemmed );
|
69
|
+
//printf( "word: %s -> %s\n", word.c_str(), stemmed.c_str() );
|
70
|
+
this->tags[stemmed] = word;
|
62
71
|
|
63
72
|
}
|
64
73
|
}
|
@@ -84,6 +93,7 @@ std::vector<std::string> NWordTagger::execute( const char *text, short max )cons
|
|
84
93
|
|
85
94
|
// get the stemmed word at position i
|
86
95
|
match_word = this->stemWord(words[i]);
|
96
|
+
word_downcase( match_word );
|
87
97
|
|
88
98
|
// now scan ahead nwords positions searching our tags table for matches
|
89
99
|
for( short j = 1; (j <= this->nwords) && ((i+j) < words.size()); ++j ) {
|
data/lib/rbtagger/version.rb
CHANGED
data/lib/rbtagger.rb
CHANGED
@@ -2,5 +2,12 @@ $:.unshift(File.dirname(__FILE__)) unless
|
|
2
2
|
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
3
3
|
|
4
4
|
module RbTagger
|
5
|
-
|
5
|
+
class << self
|
6
|
+
def tags_from_file( file )
|
7
|
+
File.read(file).split("\n").map{|t| t.strip}
|
8
|
+
end
|
9
|
+
end
|
6
10
|
end
|
11
|
+
|
12
|
+
require 'word/tagger'
|
13
|
+
require 'brill/tagger'
|
data/lib/word/tagger.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
module Word
|
2
|
+
require 'word_tagger'
|
3
|
+
class Tagger < Tagger::WordTagger
|
4
|
+
def initialize( tags, options = {} )
|
5
|
+
if tags.is_a?(String) and File.exist?(tags)
|
6
|
+
load_tags( RbTagger.tags_from_file( tags ) )
|
7
|
+
else
|
8
|
+
load_tags( tags )
|
9
|
+
end
|
10
|
+
set_words( options[:words] || 2 )
|
11
|
+
end
|
12
|
+
|
13
|
+
def execute( text )
|
14
|
+
# strip non alpha characters
|
15
|
+
super( text.gsub(/[^\w]/,' ') )
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
data/script/txt2html
CHANGED
@@ -0,0 +1,43 @@
|
|
1
|
+
namespace :extconf do
|
2
|
+
extension = File.basename(__FILE__, '.rake')
|
3
|
+
|
4
|
+
ext = "ext/#{extension}"
|
5
|
+
ext_so = "#{ext}/#{extension}.#{Config::CONFIG['DLEXT']}"
|
6
|
+
ext_files = FileList[
|
7
|
+
"#{ext}/*.c",
|
8
|
+
"#{ext}/*.h",
|
9
|
+
"#{ext}/*.rl",
|
10
|
+
"#{ext}/extconf.rb",
|
11
|
+
"#{ext}/Makefile",
|
12
|
+
# "lib"
|
13
|
+
]
|
14
|
+
|
15
|
+
|
16
|
+
task :compile => extension do
|
17
|
+
if Dir.glob("**/#{extension}.{o,so,dll,bundle}").length == 0
|
18
|
+
STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
|
19
|
+
STDERR.puts "Gem actually failed to build. Your system is"
|
20
|
+
STDERR.puts "NOT configured properly to build #{GEM_NAME}."
|
21
|
+
STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
|
22
|
+
exit(1)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
desc "Builds just the #{extension} extension"
|
27
|
+
task extension.to_sym => ["#{ext}/Makefile", ext_so ]
|
28
|
+
|
29
|
+
file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
|
30
|
+
Dir.chdir(ext) do ruby "extconf.rb" end
|
31
|
+
end
|
32
|
+
|
33
|
+
file ext_so => ext_files do
|
34
|
+
Dir.chdir(ext) do
|
35
|
+
sh(PLATFORM =~ /win32/ ? 'nmake' : 'make') do |ok, res|
|
36
|
+
if !ok
|
37
|
+
require "fileutils"
|
38
|
+
FileUtils.rm Dir.glob('*.{so,o,dll,bundle}')
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
namespace :extconf do
|
2
|
+
extension = File.basename(__FILE__, '.rake')
|
3
|
+
|
4
|
+
ext = "ext/#{extension}"
|
5
|
+
ext_so = "#{ext}/#{extension}.#{Config::CONFIG['DLEXT']}"
|
6
|
+
ext_files = FileList[
|
7
|
+
"#{ext}/*.c",
|
8
|
+
"#{ext}/*.h",
|
9
|
+
"#{ext}/*.rl",
|
10
|
+
"#{ext}/extconf.rb",
|
11
|
+
"#{ext}/Makefile",
|
12
|
+
# "lib"
|
13
|
+
]
|
14
|
+
|
15
|
+
|
16
|
+
task :compile => extension do
|
17
|
+
if Dir.glob("**/#{extension}.{o,so,dll,bundle}").length == 0
|
18
|
+
STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
|
19
|
+
STDERR.puts "Gem actually failed to build. Your system is"
|
20
|
+
STDERR.puts "NOT configured properly to build #{GEM_NAME}."
|
21
|
+
STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
|
22
|
+
exit(1)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
desc "Builds just the #{extension} extension"
|
27
|
+
task extension.to_sym => ["#{ext}/Makefile", ext_so ]
|
28
|
+
|
29
|
+
file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
|
30
|
+
Dir.chdir(ext) do ruby "extconf.rb" end
|
31
|
+
end
|
32
|
+
|
33
|
+
file ext_so => ext_files do
|
34
|
+
Dir.chdir(ext) do
|
35
|
+
sh(PLATFORM =~ /win32/ ? 'nmake' : 'make') do |ok, res|
|
36
|
+
if !ok
|
37
|
+
require "fileutils"
|
38
|
+
FileUtils.rm Dir.glob('*.{so,o,dll,bundle}')
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
data/tasks/extconf.rake
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
namespace :extconf do
|
2
|
+
desc "Compiles the Ruby extension"
|
3
|
+
task :compile
|
4
|
+
end
|
5
|
+
|
6
|
+
BIN = "*.{bundle,jar,so,obj,pdb,lib,def,exp}"
|
7
|
+
|
8
|
+
task :compile => "extconf:compile" do
|
9
|
+
Dir["ext/**/*.{bundle,so,dll}"].each do|lib|
|
10
|
+
sh "cp #{lib} lib/"
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
task :test => :compile
|
15
|
+
|
16
|
+
$hoe.clean_globs |= ["ext/**/#{BIN}", "lib/**/#{BIN}", 'ext/word_tagger/Makefile', 'ext/rule_tagger/Makefile']
|
17
|
+
$hoe.spec.require_paths = Dir['{lib,ext/*}']
|
18
|
+
$hoe.spec.extensions = FileList["ext/**/extconf.rb"].to_a
|