rbtagger 0.4.6 → 0.4.7

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -1,6 +1,9 @@
1
1
  require 'rake/clean'
2
2
  require 'rake/testtask'
3
3
  require 'rake/rdoctask'
4
+ $:.unshift File.expand_path(File.dirname(__FILE__))
5
+ $:.unshift File.expand_path(File.dirname(__FILE__), 'lib')
6
+ $:.unshift File.expand_path(File.dirname(__FILE__), 'ext')
4
7
 
5
8
  CLEAN.include '**/*.o'
6
9
  CLEAN.include "**/*.#{Config::MAKEFILE_CONFIG['DLEXT']}"
@@ -1,3 +1,5 @@
1
+ # encoding: utf-8
2
+
1
3
  require 'rule_tagger/rule_tagger'
2
4
 
3
5
  module Brill
@@ -29,7 +31,7 @@ module Brill
29
31
  # see: http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/Tagger.pm
30
32
  def noun_phrases(text)
31
33
  # ?:$PREP|$DET|$NUM)
32
- #
34
+ #
33
35
  tags = tag(text.gsub(/[^\w]/,' '))
34
36
  phrases = []
35
37
  phrase = []
@@ -62,7 +64,7 @@ module Brill
62
64
  # join NNP's together for names
63
65
  reduced_tags = []
64
66
  mappings = {} # keep a mapping of the joined words to expand
65
- tags.each{|tag|
67
+ tags.each{|tag|
66
68
  if ptag.last == 'NNP' and tag.last == 'NNP' and !ptag.first.match(/\.$/)
67
69
  ptag[0] += " " + tag.first
68
70
  # before combining these two create a mapping for each word to each word
@@ -119,7 +121,7 @@ module Brill
119
121
  end
120
122
 
121
123
  # Tag a body of text
122
- # returns an array like [[token,tag],[token,tag]...[token,tag]]
124
+ # returns an array like [[token,tag],[token,tag]...[token,tag]]
123
125
  #
124
126
  def tag( text )
125
127
  # XXX: the list of contractions is much larger than this... find'em
@@ -130,14 +132,14 @@ module Brill
130
132
 
131
133
  @tagger.apply_lexical_rules( tokens, tags, [], 0 )
132
134
  @tagger.default_tag_finish( tokens, tags )
133
-
135
+
134
136
  # Brill uses these fake "STAART" tags to delimit the start & end of sentence.
135
- tokens << "STAART"
136
- tokens << "STAART"
137
+ tokens << "STAART"
138
+ tokens << "STAART"
137
139
  tokens.unshift "STAART"
138
140
  tokens.unshift "STAART"
139
- tags << "STAART"
140
- tags << "STAART"
141
+ tags << "STAART"
142
+ tags << "STAART"
141
143
  tags.unshift "STAART"
142
144
  tags.unshift "STAART"
143
145
 
@@ -166,7 +168,7 @@ module Brill
166
168
  end
167
169
  lines
168
170
  end
169
- # load LEXICON
171
+ # load LEXICON
170
172
  def self.load_lexicon(tagger,lexicon)
171
173
  lines = Brill::Tagger.lines(lexicon)
172
174
  i = 0
@@ -187,7 +189,7 @@ module Brill
187
189
  end
188
190
  end
189
191
 
190
- # load LEXICALRULEFILE
192
+ # load LEXICALRULEFILE
191
193
  def self.load_lexical_rules(tagger,rules)
192
194
  lines = self.lines(rules)
193
195
  i = 0
@@ -273,7 +275,7 @@ module Brill
273
275
  # Isolate any embedded punctuation chars
274
276
  # s{([,;:\@\#\$\%&])} { $1 }g;
275
277
  text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')
276
-
278
+
277
279
  # Assume sentence tokenization has been done first, so split FINAL
278
280
  # periods only.
279
281
  # s/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /$1 .$2 /gx;
@@ -329,7 +331,7 @@ module Brill
329
331
  text.gsub!(/ (\'t)(is|was) /i,' \1 \2 ')
330
332
  #s/ (wan)(na) / $1 $2 /ig;
331
333
  text.gsub!(/ (wan)(na) /i,' \1 \2 ')
332
-
334
+
333
335
  text.split(/\s/)
334
336
  end
335
337
 
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
  module RbTagger
2
3
  class << self
3
4
  def tags_from_file( file )
@@ -1,8 +1,9 @@
1
+ # encoding: utf-8
1
2
  module RbTagger #:nodoc:
2
3
  module VERSION #:nodoc:
3
4
  MAJOR = 0
4
5
  MINOR = 4
5
- TINY = 6
6
+ TINY = 7
6
7
 
7
8
  STRING = [MAJOR, MINOR, TINY].join('.')
8
9
  end
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
  require 'word_tagger/word_tagger'
2
3
 
3
4
  module Word
metadata CHANGED
@@ -1,7 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbtagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.6
4
+ hash: 1
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 4
9
+ - 7
10
+ version: 0.4.7
5
11
  platform: ruby
6
12
  authors:
7
13
  - Todd A. Fisher
@@ -9,7 +15,7 @@ autorequire:
9
15
  bindir: bin
10
16
  cert_chain: []
11
17
 
12
- date: 2010-01-25 00:00:00 -05:00
18
+ date: 2011-03-30 00:00:00 -04:00
13
19
  default_executable:
14
20
  dependencies: []
15
21
 
@@ -76,21 +82,27 @@ require_paths:
76
82
  - lib
77
83
  - ext
78
84
  required_ruby_version: !ruby/object:Gem::Requirement
85
+ none: false
79
86
  requirements:
80
87
  - - ">="
81
88
  - !ruby/object:Gem::Version
89
+ hash: 3
90
+ segments:
91
+ - 0
82
92
  version: "0"
83
- version:
84
93
  required_rubygems_version: !ruby/object:Gem::Requirement
94
+ none: false
85
95
  requirements:
86
96
  - - ">="
87
97
  - !ruby/object:Gem::Version
98
+ hash: 3
99
+ segments:
100
+ - 0
88
101
  version: "0"
89
- version:
90
102
  requirements: []
91
103
 
92
104
  rubyforge_project: curb
93
- rubygems_version: 1.3.5
105
+ rubygems_version: 1.3.7
94
106
  signing_key:
95
107
  specification_version: 3
96
108
  summary: Ruby libcurl bindings