treat 2.0.4 → 2.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +7 -0
- data/files/21552208.html +797 -0
- data/files/nethttp-cheat-sheet-2940.html +393 -0
- data/lib/treat/config/data/core.rb +1 -1
- data/lib/treat/config/data/languages/english.rb +1 -1
- data/lib/treat/config/data/languages/german.rb +2 -0
- data/lib/treat/config/data/libraries.rb +4 -0
- data/lib/treat/entities/entity/buildable.rb +4 -6
- data/lib/treat/helpers/string.rb +1 -1
- data/lib/treat/loaders/bind_it.rb +48 -0
- data/lib/treat/loaders/open_nlp.rb +12 -0
- data/lib/treat/loaders/stanford.rb +7 -46
- data/lib/treat/proxies/proxy.rb +3 -2
- data/lib/treat/version.rb +1 -1
- data/lib/treat/workers/formatters/readers/autoselect.rb +1 -1
- data/lib/treat/workers/formatters/readers/document.rb +17 -0
- data/lib/treat/workers/formatters/unserializers/xml.rb +1 -0
- data/lib/treat/workers/groupable.rb +1 -1
- data/lib/treat/workers/processors/tokenizers/{maxent.rb → open_nlp.rb} +5 -4
- data/spec/entities/document.rb +2 -2
- data/spec/helper.rb +4 -0
- data/spec/sandbox.rb +306 -0
- data/spec/workers/examples/english/mathematicians/euler.html +21 -0
- data/spec/workers/examples/english/mathematicians/pythagoras.docx +0 -0
- metadata +28 -4
data/lib/treat/helpers/string.rb
CHANGED
@@ -54,7 +54,7 @@ class Treat::Helpers::String
|
|
54
54
|
if @@cc_cache[o_phrase]
|
55
55
|
return @@cc_cache[o_phrase]
|
56
56
|
end
|
57
|
-
if Treat.core.acronyms.include?(phrase)
|
57
|
+
if Treat.core.acronyms.include?(phrase.downcase)
|
58
58
|
phrase = phrase.upcase
|
59
59
|
else
|
60
60
|
phrase.gsub!(Regex) { |a| a.upcase }
|
@@ -0,0 +1,48 @@
|
|
1
|
+
class Treat::Loaders::BindIt
|
2
|
+
|
3
|
+
# Keep track of whether its loaded or not.
|
4
|
+
@@loaded = {}
|
5
|
+
|
6
|
+
# Load CoreNLP package for a given language.
|
7
|
+
def self.load(klass, name, language = nil)
|
8
|
+
|
9
|
+
return if @@loaded[klass]
|
10
|
+
|
11
|
+
language ||= Treat.core.language.default
|
12
|
+
|
13
|
+
jar_path = Treat.libraries[name].jar_path ||
|
14
|
+
Treat.paths.bin + "#{name}/"
|
15
|
+
model_path = Treat.libraries[name].model_path ||
|
16
|
+
Treat.paths.models + "#{name}/"
|
17
|
+
|
18
|
+
if !File.directory?(jar_path)
|
19
|
+
raise Treat::Exception, "Looking for #{klass} " +
|
20
|
+
"library JAR files in #{jar_path}, but it is " +
|
21
|
+
"not a directory. Please set the config option " +
|
22
|
+
"Treat.libraries.#{name}.jar_path to a folder " +
|
23
|
+
"containing the appropriate JAR files."
|
24
|
+
end
|
25
|
+
|
26
|
+
if !File.directory?(model_path)
|
27
|
+
raise Treat::Exception, "Looking for #{klass} " +
|
28
|
+
"library model files in #{model_path}, but it " +
|
29
|
+
"is not a directory. Please set the config option " +
|
30
|
+
"Treat.libraries.#{name}.model_path to a folder " +
|
31
|
+
"containing the appropriate JAR files."
|
32
|
+
end
|
33
|
+
|
34
|
+
klass.jar_path = jar_path
|
35
|
+
klass.model_path = model_path
|
36
|
+
klass.use language
|
37
|
+
|
38
|
+
if Treat.core.verbosity.silence
|
39
|
+
klass.log_file = '/dev/null'
|
40
|
+
end
|
41
|
+
|
42
|
+
klass.bind
|
43
|
+
|
44
|
+
@@loaded[klass] = true
|
45
|
+
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
@@ -1,53 +1,14 @@
|
|
1
|
+
require 'treat/loaders/bind_it'
|
2
|
+
|
1
3
|
# A helper class to load the CoreNLP package.
|
2
|
-
class Treat::Loaders::Stanford
|
4
|
+
class Treat::Loaders::Stanford < Treat::Loaders::BindIt
|
3
5
|
|
4
|
-
|
5
|
-
@@loaded = false
|
6
|
+
require 'stanford-core-nlp'
|
6
7
|
|
7
|
-
# Load CoreNLP package for a given language.
|
8
8
|
def self.load(language = nil)
|
9
|
-
|
10
|
-
return if @@loaded
|
11
|
-
|
12
|
-
language ||= Treat.core.language.default
|
13
|
-
|
14
|
-
jar_path = Treat.libraries.stanford.jar_path ||
|
15
|
-
Treat.paths.bin + 'stanford/'
|
16
|
-
model_path = Treat.libraries.stanford.model_path ||
|
17
|
-
Treat.paths.models + 'stanford/'
|
18
|
-
|
19
|
-
if !File.directory?(jar_path)
|
20
|
-
raise Treat::Exception, "Looking for Stanford " +
|
21
|
-
"CoreNLP JAR files in #{jar_path}, but it is " +
|
22
|
-
"not a directory. Please set the config option " +
|
23
|
-
"Treat.libraries.stanford.jar_path to a folder " +
|
24
|
-
"containing the Stanford JAR files."
|
25
|
-
end
|
26
|
-
|
27
|
-
if !File.directory?(model_path)
|
28
|
-
raise Treat::Exception, "Looking for Stanford " +
|
29
|
-
"CoreNLP model files in #{model_path}, but it " +
|
30
|
-
"is not a directory. Please set the config option " +
|
31
|
-
"Treat.libraries.stanford.model_path to a folder " +
|
32
|
-
"containing the Stanford JAR files."
|
33
|
-
end
|
34
|
-
|
35
|
-
require 'stanford-core-nlp'
|
36
|
-
|
37
|
-
StanfordCoreNLP.jar_path = jar_path
|
38
|
-
StanfordCoreNLP.model_path = model_path
|
39
|
-
StanfordCoreNLP.use(language)
|
40
|
-
|
41
|
-
if Treat.core.verbosity.silence
|
42
|
-
StanfordCoreNLP.log_file = '/dev/null'
|
43
|
-
end
|
44
|
-
|
45
|
-
StanfordCoreNLP.bind
|
46
|
-
|
47
|
-
@@loaded = true
|
48
|
-
|
9
|
+
super(StanfordCoreNLP, :stanford, language)
|
49
10
|
end
|
50
|
-
|
11
|
+
|
51
12
|
def self.find_model(name, language)
|
52
13
|
language = language.intern
|
53
14
|
model_file = StanfordCoreNLP::Config::Models[name][language]
|
@@ -57,4 +18,4 @@ class Treat::Loaders::Stanford
|
|
57
18
|
File.join(model_path, model_dir, model_file)
|
58
19
|
end
|
59
20
|
|
60
|
-
end
|
21
|
+
end
|
data/lib/treat/proxies/proxy.rb
CHANGED
@@ -11,14 +11,15 @@ module Treat::Proxies
|
|
11
11
|
def method_missing(sym, *args, &block)
|
12
12
|
if [:do, :apply].include?(sym) ||
|
13
13
|
Treat::Workers.lookup(sym)
|
14
|
-
|
14
|
+
to_entity.send(sym, *args)
|
15
15
|
else
|
16
16
|
super(sym, *args, &block)
|
17
17
|
end
|
18
18
|
end
|
19
|
+
|
19
20
|
# Create an unknown type of entity by default.
|
20
21
|
def to_entity(builder = nil)
|
21
|
-
Treat::Entities::Unknown(self.to_s)
|
22
|
+
Treat::Entities::Unknown.new(self.to_s)
|
22
23
|
end
|
23
24
|
end
|
24
25
|
|
data/lib/treat/version.rb
CHANGED
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'yomu'
|
2
|
+
|
3
|
+
# This class is a wrapper for Yomu.
|
4
|
+
# Yomu is a library for extracting text and metadata from files and documents
|
5
|
+
# using the Apache Tika content analysis toolkit.
|
6
|
+
class Treat::Workers::Formatters::Readers::Document
|
7
|
+
# Extract the readable text from any document.
|
8
|
+
#
|
9
|
+
# Options: none.
|
10
|
+
def self.read(document, options = {})
|
11
|
+
yomu = Yomu.new(document.file)
|
12
|
+
|
13
|
+
document.value = yomu.text
|
14
|
+
document.set :format, yomu.mimetype.extensions.first
|
15
|
+
document
|
16
|
+
end
|
17
|
+
end
|
@@ -1,8 +1,10 @@
|
|
1
1
|
# Maximum entropy tokenization supplied by OpenNLP.
|
2
|
-
class Treat::Workers::Processors::Tokenizers::Maxent
|
2
|
+
class Treat::Workers::Processors::Tokenizers::OpenNlp
|
3
3
|
|
4
4
|
require 'open-nlp'
|
5
|
-
OpenNLP.load
|
5
|
+
Treat::Loaders::OpenNLP.load
|
6
|
+
|
7
|
+
@@tokenizers = {}
|
6
8
|
|
7
9
|
# Maximum entropy tokenization.
|
8
10
|
def self.tokenize(entity, options = {})
|
@@ -20,8 +22,7 @@ class Treat::Workers::Processors::Tokenizers::Maxent
|
|
20
22
|
tokens = tokenizer.tokenize(str).to_a
|
21
23
|
|
22
24
|
tokens.each do |token|
|
23
|
-
entity << Treat::Entities
|
24
|
-
::Token.from_string(chunk)
|
25
|
+
entity << Treat::Entities::Token.from_string(token)
|
25
26
|
end
|
26
27
|
|
27
28
|
end
|
data/spec/entities/document.rb
CHANGED
@@ -9,10 +9,10 @@ module Treat::Specs::Entities
|
|
9
9
|
it "opens the file and reads its " +
|
10
10
|
"content into a document" do
|
11
11
|
f = Treat.paths.spec +
|
12
|
-
'workers/examples/english/mathematicians/
|
12
|
+
'workers/examples/english/mathematicians/pythagoras.docx'
|
13
13
|
d = Treat::Entities::Document.build(f)
|
14
14
|
d.should be_an_instance_of Treat::Entities::Document
|
15
|
-
d.to_s.index('
|
15
|
+
d.to_s.index('Pythagoras of Samos').should_not eql nil
|
16
16
|
end
|
17
17
|
end
|
18
18
|
|
data/spec/helper.rb
CHANGED
@@ -13,6 +13,10 @@ module Treat::Specs
|
|
13
13
|
'/ruby/stanford-core-nlp-minimal/models/'
|
14
14
|
Treat.libraries.stanford.jar_path =
|
15
15
|
'/ruby/stanford-core-nlp-minimal/bin/'
|
16
|
+
Treat.libraries.open_nlp.jar_path =
|
17
|
+
'/ruby/open-nlp-english/bin/'
|
18
|
+
Treat.libraries.open_nlp.model_path =
|
19
|
+
'/ruby/open-nlp-english/models/'
|
16
20
|
Treat.libraries.punkt.model_path =
|
17
21
|
'/ruby/punkt/models/'
|
18
22
|
Treat.libraries.reuters.model_path =
|
data/spec/sandbox.rb
ADDED
@@ -0,0 +1,306 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require_relative '../lib/treat'
|
3
|
+
|
4
|
+
Treat.databases.mongo.db = 'treat_test'
|
5
|
+
Treat.libraries.stanford.model_path =
|
6
|
+
'/ruby/stanford-core-nlp-minimal/models/'
|
7
|
+
Treat.libraries.stanford.jar_path =
|
8
|
+
'/ruby/stanford-core-nlp-minimal/bin/'
|
9
|
+
Treat.libraries.punkt.model_path =
|
10
|
+
'/ruby/punkt/models/'
|
11
|
+
Treat.libraries.reuters.model_path =
|
12
|
+
'/ruby/reuters/models/'
|
13
|
+
Treat.libraries.open_nlp.jar_path =
|
14
|
+
'/ruby/open-nlp-english/bin/'
|
15
|
+
Treat.libraries.open_nlp.model_path =
|
16
|
+
'/ruby/open-nlp-english/models/'
|
17
|
+
Treat.core.verbosity.silence = false
|
18
|
+
|
19
|
+
include Treat::Core::DSL
|
20
|
+
|
21
|
+
s = sentence "This is a sentence to parse!"
|
22
|
+
s.tokenize(:open_nlp).parse
|
23
|
+
s.print_tree
|
24
|
+
|
25
|
+
=begin
|
26
|
+
Treat::Builder.new do
|
27
|
+
p = phrase "26 Feb"
|
28
|
+
p.tokenize.time :kronic
|
29
|
+
puts p.inspect
|
30
|
+
s = sentence "Hello, world!"
|
31
|
+
s2 = sentence "Hello world"
|
32
|
+
puts s.similarity :jaro_winkler, to: s2
|
33
|
+
puts s.distance :levenshtein, to: s2
|
34
|
+
# puts s.similarity :tf_idf, to: s2
|
35
|
+
end
|
36
|
+
|
37
|
+
g = group("I was running")
|
38
|
+
puts g.tag.inspect
|
39
|
+
|
40
|
+
Treat.libraries.stanford.jar_path = '/ruby/treat/bin/'
|
41
|
+
Treat.libraries.stanford.model_path = '/ruby/treat/models/'
|
42
|
+
|
43
|
+
p = paragraph
|
44
|
+
s = sentence
|
45
|
+
w = word
|
46
|
+
|
47
|
+
p = phrase 'hello world'
|
48
|
+
e = email 'louis@gmail.com'
|
49
|
+
|
50
|
+
d = question(:is_feature, :word)
|
51
|
+
=end
|
52
|
+
#d = document Treat.paths.spec + 'workers/examples/english/economist/hungarys_troubles.txt'
|
53
|
+
#d.apply :chunk, :segment, :tokenize, :tag, :category, :name_tag
|
54
|
+
#d.print_tree
|
55
|
+
#d = document Treat.paths.spec + 'workers/examples/english/economist/saving_the_euro.odt'
|
56
|
+
#d.print_tree
|
57
|
+
=begin
|
58
|
+
d = document 'test.htm'
|
59
|
+
d.apply :chunk
|
60
|
+
#d.serialize :yaml, file: 'test444.yaml'
|
61
|
+
d.set :test, 2
|
62
|
+
d.serialize :mongo, db: 'test'
|
63
|
+
d.set :test, 3
|
64
|
+
d.serialize :mongo, db: 'test'
|
65
|
+
d.apply :segment, :tokenize, :tag, :category
|
66
|
+
puts d.verb_count
|
67
|
+
#d2 = document id: d.id, db: 'test'
|
68
|
+
d2 = document 'features.test' => 3, db: 'test'
|
69
|
+
d2.apply :segment, :tokenize, :tag, :category
|
70
|
+
puts d2.verb_count
|
71
|
+
#d.print_tree
|
72
|
+
#s = document 'http://www.economist.com'
|
73
|
+
|
74
|
+
p = phrase 'hello', 'world', '!'
|
75
|
+
puts p.to_s
|
76
|
+
puts p.to_str
|
77
|
+
=end
|
78
|
+
|
79
|
+
=begin
|
80
|
+
### Super basics.
|
81
|
+
puts p.value
|
82
|
+
|
83
|
+
p << 'bitch'
|
84
|
+
p << word('hello')
|
85
|
+
puts p.to_s
|
86
|
+
puts p.to_str
|
87
|
+
puts p.value
|
88
|
+
puts p.to_ary.inspect
|
89
|
+
=end
|
90
|
+
|
91
|
+
=begin
|
92
|
+
|
93
|
+
### Configuration
|
94
|
+
|
95
|
+
# A boolean value indicating whether to silence the output of external libraries (e.g. Stanford tools, Enju, LDA, Ruby-FANN) when they are used.
|
96
|
+
puts Treat.core.verbosity.silence
|
97
|
+
# A boolean value indicating whether to explain the steps that Treat is performing.
|
98
|
+
puts Treat.core.verbosity.debug
|
99
|
+
# A boolean value indicating whether Treat should try to detect the language of newly input text.
|
100
|
+
puts Treat.core.language.detect
|
101
|
+
# The language to default to when detection is off.
|
102
|
+
puts Treat.core.language.default
|
103
|
+
# A symbol representing the finest level at which language detection should be performed if language detection is turned on.
|
104
|
+
puts Treat.core.language.detect_at
|
105
|
+
|
106
|
+
# A directory in which to create temporary files.
|
107
|
+
puts Treat.paths.tmp
|
108
|
+
# A directory in which to store downloaded files.
|
109
|
+
puts Treat.paths.files
|
110
|
+
# A directory containing trained models for various tasks.
|
111
|
+
puts Treat.paths.models
|
112
|
+
# A directory containing the spec files.
|
113
|
+
puts Treat.paths.spec
|
114
|
+
# A directory containing executables and JAR files.
|
115
|
+
puts Treat.paths.bin
|
116
|
+
puts Treat.paths.lib
|
117
|
+
|
118
|
+
# Set up Mongoid.
|
119
|
+
Treat.databases.mongo.db = 'your_database'
|
120
|
+
Treat.databases.mongo.host = 'localhost'
|
121
|
+
Treat.databases.mongo.port = '27017'
|
122
|
+
|
123
|
+
# Transparent string casting.
|
124
|
+
s = 'inflection'.stem
|
125
|
+
# is equivalent to
|
126
|
+
s = 'inflection'.to_entity.stem
|
127
|
+
# which comes down to
|
128
|
+
s = word('inflection').stem
|
129
|
+
|
130
|
+
# Transparent number casting.
|
131
|
+
n = 2.ordinal
|
132
|
+
# is equivalent to
|
133
|
+
s = 2.to_entity.ordinal
|
134
|
+
# which comes down to
|
135
|
+
s = number(2).ordinal
|
136
|
+
=end
|
137
|
+
=begin
|
138
|
+
### BASIC USAGE
|
139
|
+
|
140
|
+
# Create a sentence
|
141
|
+
s = sentence 'Those who dream by day know of at least ' +
|
142
|
+
'19 things that escape those who dream only at night.'
|
143
|
+
|
144
|
+
# Tokenize and tag it.
|
145
|
+
s.tokenize.tag
|
146
|
+
|
147
|
+
# View the sentence structure.
|
148
|
+
s.print_tree
|
149
|
+
|
150
|
+
# Iterate over the tokens.
|
151
|
+
s.each_token do |tok|
|
152
|
+
puts tok.value
|
153
|
+
puts tok.type
|
154
|
+
end
|
155
|
+
|
156
|
+
|
157
|
+
|
158
|
+
# Arrays instead of iterators.
|
159
|
+
(s.nouns + s.adjectives).each do |word|
|
160
|
+
puts word.synonyms
|
161
|
+
puts word.antonyms
|
162
|
+
end
|
163
|
+
|
164
|
+
# Functions on numbers.
|
165
|
+
s.each_number do |num|
|
166
|
+
puts num.ordinal
|
167
|
+
puts num.cardinal
|
168
|
+
end
|
169
|
+
|
170
|
+
# See all the annotations.
|
171
|
+
s.each do |tok|
|
172
|
+
puts tok.inspect
|
173
|
+
end
|
174
|
+
|
175
|
+
# Lazy way of doing all of the above.
|
176
|
+
s = sentence 'Those who dream by day know of at least ' +
|
177
|
+
'19 things that escape those who dream only at night.'
|
178
|
+
|
179
|
+
s.apply :tokenize, :tag, :category,
|
180
|
+
:stem, :hyponyms, :hypernyms,
|
181
|
+
:antonyms, :ordinal, :cardinal
|
182
|
+
|
183
|
+
=end
|
184
|
+
|
185
|
+
=begin
|
186
|
+
### A BIT MORE ADVANCED USAGE
|
187
|
+
|
188
|
+
section = section "Obama-Sarkozy Meeting\n" +
|
189
|
+
"Obama and Sarkozy met on January 1st to investigate " +
|
190
|
+
"the possibility of a new rescue plan. President " +
|
191
|
+
"Sarkozy is to meet Merkel next Tuesday in Berlin."
|
192
|
+
|
193
|
+
# Chunk: split the titles and paragraphs.
|
194
|
+
# Segment: perform sentence segmentation.
|
195
|
+
# Parse: parse the syntax of each sentence.
|
196
|
+
section.apply :chunk, :segment, :parse
|
197
|
+
|
198
|
+
# View the tree structure.
|
199
|
+
section.print_tree
|
200
|
+
|
201
|
+
# Get some basic info on the text.
|
202
|
+
puts section.title
|
203
|
+
puts section.sentence_count
|
204
|
+
puts section.word_count
|
205
|
+
|
206
|
+
section.apply :category
|
207
|
+
puts section.noun_count
|
208
|
+
puts section.frequency_of 'president'
|
209
|
+
|
210
|
+
section.each_phrase_with_tag('NP') do |phrase|
|
211
|
+
puts phrase.to_s
|
212
|
+
end
|
213
|
+
|
214
|
+
=end
|
215
|
+
=begin
|
216
|
+
### URL documents, XML serialization.
|
217
|
+
|
218
|
+
urls = ['http://www.cbc.ca/news/world/story/2012/11/25/snc-lavalin-ben-aissa-charges.html',
|
219
|
+
'http://www.cbc.ca/news/world/story/2012/11/25/egypt.html', 'http://www.cbc.ca/news/canada/prince-edward-island/story/2012/11/25/pei-murder-arrest-stlucia.html', 'http://www.cbc.ca/news/world/story/2012/11/25/bangladesh-garment-factory-fire.html']
|
220
|
+
|
221
|
+
c = collection
|
222
|
+
urls.each { |url| c << document(url) }
|
223
|
+
|
224
|
+
# View the collection.
|
225
|
+
c.print_tree
|
226
|
+
|
227
|
+
c.apply :chunk, :segment, :tokenize
|
228
|
+
c.serialize :xml, :file => 'test.xml'
|
229
|
+
|
230
|
+
# Reopen the collection.
|
231
|
+
c = collection 'test.xml'
|
232
|
+
|
233
|
+
# View it again.
|
234
|
+
c.print_tree
|
235
|
+
=end
|
236
|
+
=begin
|
237
|
+
include Treat::Core::DSL
|
238
|
+
|
239
|
+
# Show progress bars for download.
|
240
|
+
Treat.core.verbosity.silence = false
|
241
|
+
# Explain what Treat is doing.
|
242
|
+
Treat.core.verbosity.debug = true
|
243
|
+
|
244
|
+
# Define the question "is it junk?" on sentences.
|
245
|
+
qn = question(:is_junk, :sentence)
|
246
|
+
|
247
|
+
# Frame the problem as depending on punctuation
|
248
|
+
# count and word count for each sentence.
|
249
|
+
pb = problem(qn,
|
250
|
+
feature(:punctuation_count),
|
251
|
+
feature(:word_count) )
|
252
|
+
|
253
|
+
# Get some web documents to work on.
|
254
|
+
url1 = 'http://en.wikipedia.org/wiki/NOD_mouse'
|
255
|
+
url2 = 'http://en.wikipedia.org/wiki/Academic_studies_about_Wikipedia'
|
256
|
+
d1, d2 = document(url1), document(url2)
|
257
|
+
|
258
|
+
# Process both of our documents.
|
259
|
+
[d1,d2].apply(:chunk, :segment, :tokenize)
|
260
|
+
|
261
|
+
# Answer our problem to create a training set.
|
262
|
+
d1.sentences[0..17].each { |s| s.set :is_junk, 0 }
|
263
|
+
d1.sentences[17..-1].each { |s| s.set :is_junk, 1 }
|
264
|
+
d_set = d1.export(pb)
|
265
|
+
|
266
|
+
# Define our gold standard results for evaluation.
|
267
|
+
d2.sentences[0..81].each { |s| s.set :is_true_junk, 0 }
|
268
|
+
d2.sentences[81..-1].each { |s| s.set :is_true_junk, 1 }
|
269
|
+
|
270
|
+
tp, fp, tn, fn = 0.0, 0.0, 0.0, 0.0
|
271
|
+
|
272
|
+
d2.sentences.map do |s|
|
273
|
+
pred = s.classify(:id3, training: d_set)
|
274
|
+
if pred == 1
|
275
|
+
tp += 1 if s.is_true_junk == 1
|
276
|
+
fp += 1 if s.is_true_junk == 0
|
277
|
+
else
|
278
|
+
tn += 1 if s.is_true_junk == 0
|
279
|
+
fn += 1 if s.is_true_junk == 1
|
280
|
+
end
|
281
|
+
end
|
282
|
+
|
283
|
+
puts "Precision: #{tp/(tp + fp)}"
|
284
|
+
puts "Recall: #{tp/(tp + fn)}"
|
285
|
+
=end
|
286
|
+
=begin
|
287
|
+
d = document 'http://louismullie.com/susan-text-scan1.jpg'
|
288
|
+
d.apply :chunk, :segment, :tokenize
|
289
|
+
d.print_tree
|
290
|
+
=end
|
291
|
+
=begin
|
292
|
+
# Syntax example
|
293
|
+
phra = phrase 'Obama', 'Sarkozy', 'Meeting'
|
294
|
+
|
295
|
+
para = paragraph 'Obama and Sarkozy met on January 1st to'
|
296
|
+
'investigate the possibility of a new rescue plan. Nicolas ' +
|
297
|
+
'Sarkozy is to meet Merkel next Tuesday in Berlin.'
|
298
|
+
|
299
|
+
sect = section title(phra), para
|
300
|
+
=end
|
301
|
+
=begin
|
302
|
+
puts "beer".plural.inspect
|
303
|
+
=end
|
304
|
+
# Treat.core.language.detect = true
|
305
|
+
# s = sentence "Du hast deiner Frau einen roten Ring gekauft."
|
306
|
+
#s.apply(:parse,:category).print_tree
|