treat 2.0.4 → 2.0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -54,7 +54,7 @@ class Treat::Helpers::String
54
54
  if @@cc_cache[o_phrase]
55
55
  return @@cc_cache[o_phrase]
56
56
  end
57
- if Treat.core.acronyms.include?(phrase)
57
+ if Treat.core.acronyms.include?(phrase.downcase)
58
58
  phrase = phrase.upcase
59
59
  else
60
60
  phrase.gsub!(Regex) { |a| a.upcase }
@@ -0,0 +1,48 @@
1
+ class Treat::Loaders::BindIt
2
+
3
+ # Keep track of whether it's loaded or not.
4
+ @@loaded = {}
5
+
6
+ # Load and bind the given library (e.g. CoreNLP or OpenNLP) for a given language.
7
+ def self.load(klass, name, language = nil)
8
+
9
+ return if @@loaded[klass]
10
+
11
+ language ||= Treat.core.language.default
12
+
13
+ jar_path = Treat.libraries[name].jar_path ||
14
+ Treat.paths.bin + "#{name}/"
15
+ model_path = Treat.libraries[name].model_path ||
16
+ Treat.paths.models + "#{name}/"
17
+
18
+ if !File.directory?(jar_path)
19
+ raise Treat::Exception, "Looking for #{klass} " +
20
+ "library JAR files in #{jar_path}, but it is " +
21
+ "not a directory. Please set the config option " +
22
+ "Treat.libraries.#{name}.jar_path to a folder " +
23
+ "containing the appropriate JAR files."
24
+ end
25
+
26
+ if !File.directory?(model_path)
27
+ raise Treat::Exception, "Looking for #{klass} " +
28
+ "library model files in #{model_path}, but it " +
29
+ "is not a directory. Please set the config option " +
30
+ "Treat.libraries.#{name}.model_path to a folder " +
31
+ "containing the appropriate JAR files."
32
+ end
33
+
34
+ klass.jar_path = jar_path
35
+ klass.model_path = model_path
36
+ klass.use language
37
+
38
+ if Treat.core.verbosity.silence
39
+ klass.log_file = '/dev/null'
40
+ end
41
+
42
+ klass.bind
43
+
44
+ @@loaded[klass] = true
45
+
46
+ end
47
+
48
+ end
@@ -0,0 +1,12 @@
1
+ require 'treat/loaders/bind_it'
2
+
3
+ # A helper class to load the OpenNLP package.
4
+ class Treat::Loaders::OpenNLP < Treat::Loaders::BindIt
5
+
6
+ require 'open-nlp'
7
+
8
+ def self.load(language = nil)
9
+ super(OpenNLP, :open_nlp, language)
10
+ end
11
+
12
+ end
@@ -1,53 +1,14 @@
1
+ require 'treat/loaders/bind_it'
2
+
1
3
  # A helper class to load the CoreNLP package.
2
- class Treat::Loaders::Stanford
4
+ class Treat::Loaders::Stanford < Treat::Loaders::BindIt
3
5
 
4
- # Keep track of whether its loaded or not.
5
- @@loaded = false
6
+ require 'stanford-core-nlp'
6
7
 
7
- # Load CoreNLP package for a given language.
8
8
  def self.load(language = nil)
9
-
10
- return if @@loaded
11
-
12
- language ||= Treat.core.language.default
13
-
14
- jar_path = Treat.libraries.stanford.jar_path ||
15
- Treat.paths.bin + 'stanford/'
16
- model_path = Treat.libraries.stanford.model_path ||
17
- Treat.paths.models + 'stanford/'
18
-
19
- if !File.directory?(jar_path)
20
- raise Treat::Exception, "Looking for Stanford " +
21
- "CoreNLP JAR files in #{jar_path}, but it is " +
22
- "not a directory. Please set the config option " +
23
- "Treat.libraries.stanford.jar_path to a folder " +
24
- "containing the Stanford JAR files."
25
- end
26
-
27
- if !File.directory?(model_path)
28
- raise Treat::Exception, "Looking for Stanford " +
29
- "CoreNLP model files in #{model_path}, but it " +
30
- "is not a directory. Please set the config option " +
31
- "Treat.libraries.stanford.model_path to a folder " +
32
- "containing the Stanford JAR files."
33
- end
34
-
35
- require 'stanford-core-nlp'
36
-
37
- StanfordCoreNLP.jar_path = jar_path
38
- StanfordCoreNLP.model_path = model_path
39
- StanfordCoreNLP.use(language)
40
-
41
- if Treat.core.verbosity.silence
42
- StanfordCoreNLP.log_file = '/dev/null'
43
- end
44
-
45
- StanfordCoreNLP.bind
46
-
47
- @@loaded = true
48
-
9
+ super(StanfordCoreNLP, :stanford, language)
49
10
  end
50
-
11
+
51
12
  def self.find_model(name, language)
52
13
  language = language.intern
53
14
  model_file = StanfordCoreNLP::Config::Models[name][language]
@@ -57,4 +18,4 @@ class Treat::Loaders::Stanford
57
18
  File.join(model_path, model_dir, model_file)
58
19
  end
59
20
 
60
- end
21
+ end
@@ -11,14 +11,15 @@ module Treat::Proxies
11
11
  def method_missing(sym, *args, &block)
12
12
  if [:do, :apply].include?(sym) ||
13
13
  Treat::Workers.lookup(sym)
14
- to_entity.send(sym, *args)
14
+ to_entity.send(sym, *args)
15
15
  else
16
16
  super(sym, *args, &block)
17
17
  end
18
18
  end
19
+
19
20
  # Create an unknown type of entity by default.
20
21
  def to_entity(builder = nil)
21
- Treat::Entities::Unknown(self.to_s)
22
+ Treat::Entities::Unknown.new(self.to_s)
22
23
  end
23
24
  end
24
25
 
data/lib/treat/version.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  module Treat
2
2
 
3
3
  # The current version of Treat.
4
- VERSION = "2.0.4"
4
+ VERSION = "2.0.5"
5
5
 
6
6
  # Treat requires Ruby >= 1.9.2
7
7
  if RUBY_VERSION < '1.9.2'
@@ -3,7 +3,7 @@ class Treat::Workers::Formatters::Readers::Autoselect
3
3
  ExtensionRegexp = /^.*?\.([a-zA-Z0-9]{2,5})$/
4
4
  ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
5
5
  DefaultOptions = {
6
- :default_to => 'txt'
6
+ :default_to => 'document'
7
7
  }
8
8
 
9
9
  # Choose a reader to use.
@@ -0,0 +1,17 @@
1
+ require 'yomu'
2
+
3
+ # This class is a wrapper for Yomu.
4
+ # Yomu is a library for extracting text and metadata from files and documents
5
+ # using the Apache Tika content analysis toolkit.
6
+ class Treat::Workers::Formatters::Readers::Document
7
+ # Extract the readable text from any document.
8
+ #
9
+ # Options: none.
10
+ def self.read(document, options = {})
11
+ yomu = Yomu.new(document.file)
12
+
13
+ document.value = yomu.text
14
+ document.set :format, yomu.mimetype.extensions.first
15
+ document
16
+ end
17
+ end
@@ -65,6 +65,7 @@ class Treat::Workers::Formatters::Unserializers::XML
65
65
  value = v
66
66
  else
67
67
  v = v[1..-1].intern if v[0] == ':'
68
+ v = ":".intern if v == :''
68
69
  v = v.to_i if v =~ /^[0-9]*$/
69
70
  v = v.to_f if v =~ /^[0-9\.]*$/
70
71
  v = false if v == 'false'
@@ -15,7 +15,7 @@ module Treat::Workers::Groupable
15
15
  require file
16
16
  if not self.const_defined?(const)
17
17
  raise Treat::Exception,
18
- "File #{file} does not define " +
18
+ "File #{file}.rb does not define " +
19
19
  "#{self}::#{const}."
20
20
  end
21
21
  const_get(const)
@@ -1,8 +1,10 @@
1
1
  # Maximum entropy tokenization supplied by OpenNLP.
2
- class Treat::Workers::Processors::Tokenizers::Maxent
2
+ class Treat::Workers::Processors::Tokenizers::OpenNlp
3
3
 
4
4
  require 'open-nlp'
5
- OpenNLP.load
5
+ Treat::Loaders::OpenNLP.load
6
+
7
+ @@tokenizers = {}
6
8
 
7
9
  # Maximum entropy tokenization.
8
10
  def self.tokenize(entity, options = {})
@@ -20,8 +22,7 @@ class Treat::Workers::Processors::Tokenizers::Maxent
20
22
  tokens = tokenizer.tokenize(str).to_a
21
23
 
22
24
  tokens.each do |token|
23
- entity << Treat::Entities
24
- ::Token.from_string(chunk)
25
+ entity << Treat::Entities::Token.from_string(token)
25
26
  end
26
27
 
27
28
  end
@@ -9,10 +9,10 @@ module Treat::Specs::Entities
9
9
  it "opens the file and reads its " +
10
10
  "content into a document" do
11
11
  f = Treat.paths.spec +
12
- 'workers/examples/english/mathematicians/leibniz.txt'
12
+ 'workers/examples/english/mathematicians/pythagoras.docx'
13
13
  d = Treat::Entities::Document.build(f)
14
14
  d.should be_an_instance_of Treat::Entities::Document
15
- d.to_s.index('Gottfried Leibniz').should_not eql nil
15
+ d.to_s.index('Pythagoras of Samos').should_not eql nil
16
16
  end
17
17
  end
18
18
 
data/spec/helper.rb CHANGED
@@ -13,6 +13,10 @@ module Treat::Specs
13
13
  '/ruby/stanford-core-nlp-minimal/models/'
14
14
  Treat.libraries.stanford.jar_path =
15
15
  '/ruby/stanford-core-nlp-minimal/bin/'
16
+ Treat.libraries.open_nlp.jar_path =
17
+ '/ruby/open-nlp-english/bin/'
18
+ Treat.libraries.open_nlp.model_path =
19
+ '/ruby/open-nlp-english/models/'
16
20
  Treat.libraries.punkt.model_path =
17
21
  '/ruby/punkt/models/'
18
22
  Treat.libraries.reuters.model_path =
data/spec/sandbox.rb ADDED
@@ -0,0 +1,306 @@
1
+ # encoding: utf-8
2
+ require_relative '../lib/treat'
3
+
4
+ Treat.databases.mongo.db = 'treat_test'
5
+ Treat.libraries.stanford.model_path =
6
+ '/ruby/stanford-core-nlp-minimal/models/'
7
+ Treat.libraries.stanford.jar_path =
8
+ '/ruby/stanford-core-nlp-minimal/bin/'
9
+ Treat.libraries.punkt.model_path =
10
+ '/ruby/punkt/models/'
11
+ Treat.libraries.reuters.model_path =
12
+ '/ruby/reuters/models/'
13
+ Treat.libraries.open_nlp.jar_path =
14
+ '/ruby/open-nlp-english/bin/'
15
+ Treat.libraries.open_nlp.model_path =
16
+ '/ruby/open-nlp-english/models/'
17
+ Treat.core.verbosity.silence = false
18
+
19
+ include Treat::Core::DSL
20
+
21
+ s = sentence "This is a sentence to parse!"
22
+ s.tokenize(:open_nlp).parse
23
+ s.print_tree
24
+
25
+ =begin
26
+ Treat::Builder.new do
27
+ p = phrase "26 Feb"
28
+ p.tokenize.time :kronic
29
+ puts p.inspect
30
+ s = sentence "Hello, world!"
31
+ s2 = sentence "Hello world"
32
+ puts s.similarity :jaro_winkler, to: s2
33
+ puts s.distance :levenshtein, to: s2
34
+ # puts s.similarity :tf_idf, to: s2
35
+ end
36
+
37
+ g = group("I was running")
38
+ puts g.tag.inspect
39
+
40
+ Treat.libraries.stanford.jar_path = '/ruby/treat/bin/'
41
+ Treat.libraries.stanford.model_path = '/ruby/treat/models/'
42
+
43
+ p = paragraph
44
+ s = sentence
45
+ w = word
46
+
47
+ p = phrase 'hello world'
48
+ e = email 'louis@gmail.com'
49
+
50
+ d = question(:is_feature, :word)
51
+ =end
52
+ #d = document Treat.paths.spec + 'workers/examples/english/economist/hungarys_troubles.txt'
53
+ #d.apply :chunk, :segment, :tokenize, :tag, :category, :name_tag
54
+ #d.print_tree
55
+ #d = document Treat.paths.spec + 'workers/examples/english/economist/saving_the_euro.odt'
56
+ #d.print_tree
57
+ =begin
58
+ d = document 'test.htm'
59
+ d.apply :chunk
60
+ #d.serialize :yaml, file: 'test444.yaml'
61
+ d.set :test, 2
62
+ d.serialize :mongo, db: 'test'
63
+ d.set :test, 3
64
+ d.serialize :mongo, db: 'test'
65
+ d.apply :segment, :tokenize, :tag, :category
66
+ puts d.verb_count
67
+ #d2 = document id: d.id, db: 'test'
68
+ d2 = document 'features.test' => 3, db: 'test'
69
+ d2.apply :segment, :tokenize, :tag, :category
70
+ puts d2.verb_count
71
+ #d.print_tree
72
+ #s = document 'http://www.economist.com'
73
+
74
+ p = phrase 'hello', 'world', '!'
75
+ puts p.to_s
76
+ puts p.to_str
77
+ =end
78
+
79
+ =begin
80
+ ### Super basics.
81
+ puts p.value
82
+
83
+ p << 'bitch'
84
+ p << word('hello')
85
+ puts p.to_s
86
+ puts p.to_str
87
+ puts p.value
88
+ puts p.to_ary.inspect
89
+ =end
90
+
91
+ =begin
92
+
93
+ ### Configuration
94
+
95
+ # A boolean value indicating whether to silence the output of external libraries (e.g. Stanford tools, Enju, LDA, Ruby-FANN) when they are used.
96
+ puts Treat.core.verbosity.silence
97
+ # A boolean value indicating whether to explain the steps that Treat is performing.
98
+ puts Treat.core.verbosity.debug
99
+ # A boolean value indicating whether Treat should try to detect the language of newly input text.
100
+ puts Treat.core.language.detect
101
+ # The language to default to when detection is off.
102
+ puts Treat.core.language.default
103
+ # A symbol representing the finest level at which language detection should be performed if language detection is turned on.
104
+ puts Treat.core.language.detect_at
105
+
106
+ # A directory in which to create temporary files.
107
+ puts Treat.paths.tmp
108
+ # A directory in which to store downloaded files.
109
+ puts Treat.paths.files
110
+ # A directory containing trained models for various tasks.
111
+ puts Treat.paths.models
112
+ # A directory containing the spec files.
113
+ puts Treat.paths.spec
114
+ # A directory containing executables and JAR files.
115
+ puts Treat.paths.bin
116
+ puts Treat.paths.lib
117
+
118
+ # Set up Mongoid.
119
+ Treat.databases.mongo.db = 'your_database'
120
+ Treat.databases.mongo.host = 'localhost'
121
+ Treat.databases.mongo.port = '27017'
122
+
123
+ # Transparent string casting.
124
+ s = 'inflection'.stem
125
+ # is equivalent to
126
+ s = 'inflection'.to_entity.stem
127
+ # which comes down to
128
+ s = word('inflection').stem
129
+
130
+ # Transparent number casting.
131
+ n = 2.ordinal
132
+ # is equivalent to
133
+ s = 2.to_entity.ordinal
134
+ # which comes down to
135
+ s = number(2).ordinal
136
+ =end
137
+ =begin
138
+ ### BASIC USAGE
139
+
140
+ # Create a sentence
141
+ s = sentence 'Those who dream by day know of at least ' +
142
+ '19 things that escape those who dream only at night.'
143
+
144
+ # Tokenize and tag it.
145
+ s.tokenize.tag
146
+
147
+ # View the sentence structure.
148
+ s.print_tree
149
+
150
+ # Iterate over the tokens.
151
+ s.each_token do |tok|
152
+ puts tok.value
153
+ puts tok.type
154
+ end
155
+
156
+
157
+
158
+ # Arrays instead of iterators.
159
+ (s.nouns + s.adjectives).each do |word|
160
+ puts word.synonyms
161
+ puts word.antonyms
162
+ end
163
+
164
+ # Functions on numbers.
165
+ s.each_number do |num|
166
+ puts num.ordinal
167
+ puts num.cardinal
168
+ end
169
+
170
+ # See all the annotations.
171
+ s.each do |tok|
172
+ puts tok.inspect
173
+ end
174
+
175
+ # Lazy way of doing all of the above.
176
+ s = sentence 'Those who dream by day know of at least ' +
177
+ '19 things that escape those who dream only at night.'
178
+
179
+ s.apply :tokenize, :tag, :category,
180
+ :stem, :hyponyms, :hypernyms,
181
+ :antonyms, :ordinal, :cardinal
182
+
183
+ =end
184
+
185
+ =begin
186
+ ### A BIT MORE ADVANCED USAGE
187
+
188
+ section = section "Obama-Sarkozy Meeting\n" +
189
+ "Obama and Sarkozy met on January 1st to investigate " +
190
+ "the possibility of a new rescue plan. President " +
191
+ "Sarkozy is to meet Merkel next Tuesday in Berlin."
192
+
193
+ # Chunk: split the titles and paragraphs.
194
+ # Segment: perform sentence segmentation.
195
+ # Parse: parse the syntax of each sentence.
196
+ section.apply :chunk, :segment, :parse
197
+
198
+ # View the tree structure.
199
+ section.print_tree
200
+
201
+ # Get some basic info on the text.
202
+ puts section.title
203
+ puts section.sentence_count
204
+ puts section.word_count
205
+
206
+ section.apply :category
207
+ puts section.noun_count
208
+ puts section.frequency_of 'president'
209
+
210
+ section.each_phrase_with_tag('NP') do |phrase|
211
+ puts phrase.to_s
212
+ end
213
+
214
+ =end
215
+ =begin
216
+ ### URL documents, XML serialization.
217
+
218
+ urls = ['http://www.cbc.ca/news/world/story/2012/11/25/snc-lavalin-ben-aissa-charges.html',
219
+ 'http://www.cbc.ca/news/world/story/2012/11/25/egypt.html', 'http://www.cbc.ca/news/canada/prince-edward-island/story/2012/11/25/pei-murder-arrest-stlucia.html', 'http://www.cbc.ca/news/world/story/2012/11/25/bangladesh-garment-factory-fire.html']
220
+
221
+ c = collection
222
+ urls.each { |url| c << document(url) }
223
+
224
+ # View the collection.
225
+ c.print_tree
226
+
227
+ c.apply :chunk, :segment, :tokenize
228
+ c.serialize :xml, :file => 'test.xml'
229
+
230
+ # Reopen the collection.
231
+ c = collection 'test.xml'
232
+
233
+ # View it again.
234
+ c.print_tree
235
+ =end
236
+ =begin
237
+ include Treat::Core::DSL
238
+
239
+ # Show progress bars for download.
240
+ Treat.core.verbosity.silence = false
241
+ # Explain what Treat is doing.
242
+ Treat.core.verbosity.debug = true
243
+
244
+ # Define the question "is it junk?" on sentences.
245
+ qn = question(:is_junk, :sentence)
246
+
247
+ # Frame the problem as depending on punctuation
248
+ # count and word count for each sentence.
249
+ pb = problem(qn,
250
+ feature(:punctuation_count),
251
+ feature(:word_count) )
252
+
253
+ # Get some web documents to work on.
254
+ url1 = 'http://en.wikipedia.org/wiki/NOD_mouse'
255
+ url2 = 'http://en.wikipedia.org/wiki/Academic_studies_about_Wikipedia'
256
+ d1, d2 = document(url1), document(url2)
257
+
258
+ # Process both of our documents.
259
+ [d1,d2].apply(:chunk, :segment, :tokenize)
260
+
261
+ # Answer our problem to create a training set.
262
+ d1.sentences[0..17].each { |s| s.set :is_junk, 0 }
263
+ d1.sentences[17..-1].each { |s| s.set :is_junk, 1 }
264
+ d_set = d1.export(pb)
265
+
266
+ # Define our gold standard results for evaluation.
267
+ d2.sentences[0..81].each { |s| s.set :is_true_junk, 0 }
268
+ d2.sentences[81..-1].each { |s| s.set :is_true_junk, 1 }
269
+
270
+ tp, fp, tn, fn = 0.0, 0.0, 0.0, 0.0
271
+
272
+ d2.sentences.map do |s|
273
+ pred = s.classify(:id3, training: d_set)
274
+ if pred == 1
275
+ tp += 1 if s.is_true_junk == 1
276
+ fp += 1 if s.is_true_junk == 0
277
+ else
278
+ tn += 1 if s.is_true_junk == 0
279
+ fn += 1 if s.is_true_junk == 1
280
+ end
281
+ end
282
+
283
+ puts "Precision: #{tp/(tp + fp)}"
284
+ puts "Recall: #{tp/(tp + fn)}"
285
+ =end
286
+ =begin
287
+ d = document 'http://louismullie.com/susan-text-scan1.jpg'
288
+ d.apply :chunk, :segment, :tokenize
289
+ d.print_tree
290
+ =end
291
+ =begin
292
+ # Syntax example
293
+ phra = phrase 'Obama', 'Sarkozy', 'Meeting'
294
+
295
+ para = paragraph 'Obama and Sarkozy met on January 1st to ' +
296
+ 'investigate the possibility of a new rescue plan. Nicolas ' +
297
+ 'Sarkozy is to meet Merkel next Tuesday in Berlin.'
298
+
299
+ sect = section title(phra), para
300
+ =end
301
+ =begin
302
+ puts "beer".plural.inspect
303
+ =end
304
+ # Treat.core.language.detect = true
305
+ # s = sentence "Du hast deiner Frau einen roten Ring gekauft."
306
+ #s.apply(:parse,:category).print_tree