rbtagger 0.4.6 → 0.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +3 -0
- data/lib/brill/tagger.rb +14 -12
- data/lib/rbtagger.rb +1 -0
- data/lib/rbtagger/version.rb +2 -1
- data/lib/word/tagger.rb +1 -0
- metadata +17 -5
data/Rakefile
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
require 'rake/clean'
|
2
2
|
require 'rake/testtask'
|
3
3
|
require 'rake/rdoctask'
|
4
|
+
$:.unshift File.expand_path(File.dirname(__FILE__))
|
5
|
+
$:.unshift File.expand_path(File.dirname(__FILE__), 'lib')
|
6
|
+
$:.unshift File.expand_path(File.dirname(__FILE__), 'ext')
|
4
7
|
|
5
8
|
CLEAN.include '**/*.o'
|
6
9
|
CLEAN.include "**/*.#{Config::MAKEFILE_CONFIG['DLEXT']}"
|
data/lib/brill/tagger.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
1
3
|
require 'rule_tagger/rule_tagger'
|
2
4
|
|
3
5
|
module Brill
|
@@ -29,7 +31,7 @@ module Brill
|
|
29
31
|
# see: http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/Tagger.pm
|
30
32
|
def noun_phrases(text)
|
31
33
|
# ?:$PREP|$DET|$NUM)
|
32
|
-
#
|
34
|
+
#
|
33
35
|
tags = tag(text.gsub(/[^\w]/,' '))
|
34
36
|
phrases = []
|
35
37
|
phrase = []
|
@@ -62,7 +64,7 @@ module Brill
|
|
62
64
|
# join NNP's together for names
|
63
65
|
reduced_tags = []
|
64
66
|
mappings = {} # keep a mapping of the joined words to expand
|
65
|
-
tags.each{|tag|
|
67
|
+
tags.each{|tag|
|
66
68
|
if ptag.last == 'NNP' and tag.last == 'NNP' and !ptag.first.match(/\.$/)
|
67
69
|
ptag[0] += " " + tag.first
|
68
70
|
# before combining these two create a mapping for each word to each word
|
@@ -119,7 +121,7 @@ module Brill
|
|
119
121
|
end
|
120
122
|
|
121
123
|
# Tag a body of text
|
122
|
-
# returns an array like [[token,tag],[token,tag]...[token,tag]]
|
124
|
+
# returns an array like [[token,tag],[token,tag]...[token,tag]]
|
123
125
|
#
|
124
126
|
def tag( text )
|
125
127
|
# XXX: the list of contractions is much larger than this... find'em
|
@@ -130,14 +132,14 @@ module Brill
|
|
130
132
|
|
131
133
|
@tagger.apply_lexical_rules( tokens, tags, [], 0 )
|
132
134
|
@tagger.default_tag_finish( tokens, tags )
|
133
|
-
|
135
|
+
|
134
136
|
# Brill uses these fake "STAART" tags to delimit the start & end of sentence.
|
135
|
-
tokens << "STAART"
|
136
|
-
tokens << "STAART"
|
137
|
+
tokens << "STAART"
|
138
|
+
tokens << "STAART"
|
137
139
|
tokens.unshift "STAART"
|
138
140
|
tokens.unshift "STAART"
|
139
|
-
tags << "STAART"
|
140
|
-
tags << "STAART"
|
141
|
+
tags << "STAART"
|
142
|
+
tags << "STAART"
|
141
143
|
tags.unshift "STAART"
|
142
144
|
tags.unshift "STAART"
|
143
145
|
|
@@ -166,7 +168,7 @@ module Brill
|
|
166
168
|
end
|
167
169
|
lines
|
168
170
|
end
|
169
|
-
# load LEXICON
|
171
|
+
# load LEXICON
|
170
172
|
def self.load_lexicon(tagger,lexicon)
|
171
173
|
lines = Brill::Tagger.lines(lexicon)
|
172
174
|
i = 0
|
@@ -187,7 +189,7 @@ module Brill
|
|
187
189
|
end
|
188
190
|
end
|
189
191
|
|
190
|
-
# load LEXICALRULEFILE
|
192
|
+
# load LEXICALRULEFILE
|
191
193
|
def self.load_lexical_rules(tagger,rules)
|
192
194
|
lines = self.lines(rules)
|
193
195
|
i = 0
|
@@ -273,7 +275,7 @@ module Brill
|
|
273
275
|
# Isolate any embedded punctuation chars
|
274
276
|
# s{([,;:\@\#\$\%&])} { $1 }g;
|
275
277
|
text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')
|
276
|
-
|
278
|
+
|
277
279
|
# Assume sentence tokenization has been done first, so split FINAL
|
278
280
|
# periods only.
|
279
281
|
# s/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /$1 .$2 /gx;
|
@@ -329,7 +331,7 @@ module Brill
|
|
329
331
|
text.gsub!(/ (\'t)(is|was) /i,' \1 \2 ')
|
330
332
|
#s/ (wan)(na) / $1 $2 /ig;
|
331
333
|
text.gsub!(/ (wan)(na) /i,' \1 \2 ')
|
332
|
-
|
334
|
+
|
333
335
|
text.split(/\s/)
|
334
336
|
end
|
335
337
|
|
data/lib/rbtagger.rb
CHANGED
data/lib/rbtagger/version.rb
CHANGED
data/lib/word/tagger.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbtagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 1
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 4
|
9
|
+
- 7
|
10
|
+
version: 0.4.7
|
5
11
|
platform: ruby
|
6
12
|
authors:
|
7
13
|
- Todd A. Fisher
|
@@ -9,7 +15,7 @@ autorequire:
|
|
9
15
|
bindir: bin
|
10
16
|
cert_chain: []
|
11
17
|
|
12
|
-
date:
|
18
|
+
date: 2011-03-30 00:00:00 -04:00
|
13
19
|
default_executable:
|
14
20
|
dependencies: []
|
15
21
|
|
@@ -76,21 +82,27 @@ require_paths:
|
|
76
82
|
- lib
|
77
83
|
- ext
|
78
84
|
required_ruby_version: !ruby/object:Gem::Requirement
|
85
|
+
none: false
|
79
86
|
requirements:
|
80
87
|
- - ">="
|
81
88
|
- !ruby/object:Gem::Version
|
89
|
+
hash: 3
|
90
|
+
segments:
|
91
|
+
- 0
|
82
92
|
version: "0"
|
83
|
-
version:
|
84
93
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
94
|
+
none: false
|
85
95
|
requirements:
|
86
96
|
- - ">="
|
87
97
|
- !ruby/object:Gem::Version
|
98
|
+
hash: 3
|
99
|
+
segments:
|
100
|
+
- 0
|
88
101
|
version: "0"
|
89
|
-
version:
|
90
102
|
requirements: []
|
91
103
|
|
92
104
|
rubyforge_project: curb
|
93
|
-
rubygems_version: 1.3.
|
105
|
+
rubygems_version: 1.3.7
|
94
106
|
signing_key:
|
95
107
|
specification_version: 3
|
96
108
|
summary: Ruby libcurl bindings
|