treat 2.0.4 → 2.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -54,7 +54,7 @@ class Treat::Helpers::String
    if @@cc_cache[o_phrase]
      return @@cc_cache[o_phrase]
    end
-   if Treat.core.acronyms.include?(phrase)
+   if Treat.core.acronyms.include?(phrase.downcase)
      phrase = phrase.upcase
    else
      phrase.gsub!(Regex) { |a| a.upcase }
@@ -0,0 +1,48 @@
+ class Treat::Loaders::BindIt
+
+   # Keep track of whether its loaded or not.
+   @@loaded = {}
+
+   # Load CoreNLP package for a given language.
+   def self.load(klass, name, language = nil)
+
+     return if @@loaded[klass]
+
+     language ||= Treat.core.language.default
+
+     jar_path = Treat.libraries[name].jar_path ||
+       Treat.paths.bin + "#{name}/"
+     model_path = Treat.libraries[name].model_path ||
+       Treat.paths.models + "#{name}/"
+
+     if !File.directory?(jar_path)
+       raise Treat::Exception, "Looking for #{klass} " +
+         "library JAR files in #{jar_path}, but it is " +
+         "not a directory. Please set the config option " +
+         "Treat.libraries.#{name}.jar_path to a folder " +
+         "containing the appropriate JAR files."
+     end
+
+     if !File.directory?(model_path)
+       raise Treat::Exception, "Looking for #{klass} " +
+         "library model files in #{model_path}, but it " +
+         "is not a directory. Please set the config option " +
+         "Treat.libraries.#{name}.model_path to a folder " +
+         "containing the appropriate JAR files."
+     end
+
+     klass.jar_path = jar_path
+     klass.model_path = model_path
+     klass.use language
+
+     if Treat.core.verbosity.silence
+       klass.log_file = '/dev/null'
+     end
+
+     klass.bind
+
+     @@loaded[klass] = true
+
+   end
+
+ end
@@ -0,0 +1,12 @@
+ require 'treat/loaders/bind_it'
+
+ # A helper class to load the OpenNLP package.
+ class Treat::Loaders::OpenNLP < Treat::Loaders::BindIt
+
+   require 'open-nlp'
+
+   def self.load(language = nil)
+     super(OpenNLP, :open_nlp, language)
+   end
+
+ end
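For context, the new loader hierarchy is driven entirely by the `Treat.libraries` configuration: a subclass only names the library and the class to bind. A minimal usage sketch, assuming the OpenNLP gem is installed (the install paths below are placeholders, not values taken from this diff):

```ruby
require 'treat'

# Hypothetical install locations; point these at real OpenNLP JAR and
# model directories before calling the loader.
Treat.libraries.open_nlp.jar_path   = '/opt/open-nlp/bin/'
Treat.libraries.open_nlp.model_path = '/opt/open-nlp/models/'

# Delegates to Treat::Loaders::BindIt.load(OpenNLP, :open_nlp, language),
# which binds the Java classes once and caches the loaded state.
Treat::Loaders::OpenNLP.load(:english)
```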
@@ -1,53 +1,14 @@
+ require 'treat/loaders/bind_it'
+
  # A helper class to load the CoreNLP package.
- class Treat::Loaders::Stanford
+ class Treat::Loaders::Stanford < Treat::Loaders::BindIt

-   # Keep track of whether its loaded or not.
-   @@loaded = false
+   require 'stanford-core-nlp'

-   # Load CoreNLP package for a given language.
    def self.load(language = nil)
-
-     return if @@loaded
-
-     language ||= Treat.core.language.default
-
-     jar_path = Treat.libraries.stanford.jar_path ||
-       Treat.paths.bin + 'stanford/'
-     model_path = Treat.libraries.stanford.model_path ||
-       Treat.paths.models + 'stanford/'
-
-     if !File.directory?(jar_path)
-       raise Treat::Exception, "Looking for Stanford " +
-         "CoreNLP JAR files in #{jar_path}, but it is " +
-         "not a directory. Please set the config option " +
-         "Treat.libraries.stanford.jar_path to a folder " +
-         "containing the Stanford JAR files."
-     end
-
-     if !File.directory?(model_path)
-       raise Treat::Exception, "Looking for Stanford " +
-         "CoreNLP model files in #{model_path}, but it " +
-         "is not a directory. Please set the config option " +
-         "Treat.libraries.stanford.model_path to a folder " +
-         "containing the Stanford JAR files."
-     end
-
-     require 'stanford-core-nlp'
-
-     StanfordCoreNLP.jar_path = jar_path
-     StanfordCoreNLP.model_path = model_path
-     StanfordCoreNLP.use(language)
-
-     if Treat.core.verbosity.silence
-       StanfordCoreNLP.log_file = '/dev/null'
-     end
-
-     StanfordCoreNLP.bind
-
-     @@loaded = true
-
+     super(StanfordCoreNLP, :stanford, language)
    end
-
+
    def self.find_model(name, language)
      language = language.intern
      model_file = StanfordCoreNLP::Config::Models[name][language]
@@ -57,4 +18,4 @@ class Treat::Loaders::Stanford
      File.join(model_path, model_dir, model_file)
    end

- end
+ end
@@ -11,14 +11,15 @@ module Treat::Proxies
    def method_missing(sym, *args, &block)
      if [:do, :apply].include?(sym) ||
        Treat::Workers.lookup(sym)
-       to_entity.send(sym, *args)
+       to_entity.send(sym, *args)
      else
        super(sym, *args, &block)
      end
    end
+
    # Create an unknown type of entity by default.
    def to_entity(builder = nil)
-     Treat::Entities::Unknown(self.to_s)
+     Treat::Entities::Unknown.new(self.to_s)
    end
  end

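For context, this proxy backs the transparent string casting demonstrated later in the new sandbox script; a minimal sketch, assuming a worker such as `:stem` is available:

```ruby
require 'treat'

# Calling a worker on a plain String goes through method_missing, which
# casts the string via to_entity and forwards the call, so that
# 'inflection'.stem behaves like word('inflection').stem.
puts 'inflection'.stem
```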
data/lib/treat/version.rb CHANGED
@@ -1,7 +1,7 @@
  module Treat

    # The current version of Treat.
-   VERSION = "2.0.4"
+   VERSION = "2.0.5"

    # Treat requires Ruby >= 1.9.2
    if RUBY_VERSION < '1.9.2'
@@ -3,7 +3,7 @@ class Treat::Workers::Formatters::Readers::Autoselect
    ExtensionRegexp = /^.*?\.([a-zA-Z0-9]{2,5})$/
    ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
    DefaultOptions = {
-     :default_to => 'txt'
+     :default_to => 'document'
    }

    # Choose a reader to use.
@@ -0,0 +1,17 @@
+ require 'yomu'
+
+ # This class is a wrapper for Yomu.
+ # Yomu is a library for extracting text and metadata from files and documents
+ # using the Apache Tika content analysis toolkit.
+ class Treat::Workers::Formatters::Readers::Document
+   # Extract the readable text from any document.
+   #
+   # Options: none.
+   def self.read(document, options = {})
+     yomu = Yomu.new(document.file)
+
+     document.value = yomu.text
+     document.set :format, yomu.mimetype.extensions.first
+     document
+   end
+ end
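Together with the `:default_to => 'document'` change above, file types without a dedicated reader now fall through to this Yomu-backed reader. A minimal sketch of how it is exercised (the path is illustrative, and Yomu requires a working Java/Apache Tika setup):

```ruby
require 'treat'

# 'report.docx' is a placeholder path; any format Tika understands
# (.docx, .odt, .pdf, ...) should be readable through the new reader.
d = Treat::Entities::Document.build('report.docx')
puts d.to_s   # plain text extracted by Yomu
```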
@@ -65,6 +65,7 @@ class Treat::Workers::Formatters::Unserializers::XML
    value = v
  else
    v = v[1..-1].intern if v[0] == ':'
+   v = ":".intern if v == :''
    v = v.to_i if v =~ /^[0-9]*$/
    v = v.to_f if v =~ /^[0-9\.]*$/
    v = false if v == 'false'
@@ -15,7 +15,7 @@ module Treat::Workers::Groupable
    require file
    if not self.const_defined?(const)
      raise Treat::Exception,
-       "File #{file} does not define " +
+       "File #{file}.rb does not define " +
        "#{self}::#{const}."
    end
    const_get(const)
@@ -1,8 +1,10 @@
  # Maximum entropy tokenization supplied by OpenNLP.
- class Treat::Workers::Processors::Tokenizers::Maxent
+ class Treat::Workers::Processors::Tokenizers::OpenNlp

    require 'open-nlp'
-   OpenNLP.load
+   Treat::Loaders::OpenNLP.load
+
+   @@tokenizers = {}

    # Maximum entropy tokenization.
    def self.tokenize(entity, options = {})
@@ -20,8 +22,7 @@ class Treat::Workers::Processors::Tokenizers::Maxent
      tokens = tokenizer.tokenize(str).to_a

      tokens.each do |token|
-       entity << Treat::Entities
-         ::Token.from_string(chunk)
+       entity << Treat::Entities::Token.from_string(token)
      end

    end
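The renamed worker is what the new sandbox script invokes through the `:open_nlp` symbol. A minimal sketch, assuming the OpenNLP paths are configured as in the spec/helper.rb hunk below (the paths here are placeholders):

```ruby
require 'treat'
include Treat::Core::DSL

# Placeholder paths; see spec/helper.rb below for the values used by the
# test suite.
Treat.libraries.open_nlp.jar_path   = '/opt/open-nlp/bin/'
Treat.libraries.open_nlp.model_path = '/opt/open-nlp/models/'

s = sentence "This is a sentence to parse!"
s.tokenize(:open_nlp)   # dispatches to Tokenizers::OpenNlp.tokenize
s.print_tree
```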
@@ -9,10 +9,10 @@ module Treat::Specs::Entities
    it "opens the file and reads its " +
       "content into a document" do
      f = Treat.paths.spec +
-       'workers/examples/english/mathematicians/leibniz.txt'
+       'workers/examples/english/mathematicians/pythagoras.docx'
      d = Treat::Entities::Document.build(f)
      d.should be_an_instance_of Treat::Entities::Document
-     d.to_s.index('Gottfried Leibniz').should_not eql nil
+     d.to_s.index('Pythagoras of Samos').should_not eql nil
    end
  end

data/spec/helper.rb CHANGED
@@ -13,6 +13,10 @@ module Treat::Specs
      '/ruby/stanford-core-nlp-minimal/models/'
    Treat.libraries.stanford.jar_path =
      '/ruby/stanford-core-nlp-minimal/bin/'
+   Treat.libraries.open_nlp.jar_path =
+     '/ruby/open-nlp-english/bin/'
+   Treat.libraries.open_nlp.model_path =
+     '/ruby/open-nlp-english/models/'
    Treat.libraries.punkt.model_path =
      '/ruby/punkt/models/'
    Treat.libraries.reuters.model_path =
data/spec/sandbox.rb ADDED
@@ -0,0 +1,306 @@
+ # encoding: utf-8
+ require_relative '../lib/treat'
+
+ Treat.databases.mongo.db = 'treat_test'
+ Treat.libraries.stanford.model_path =
+   '/ruby/stanford-core-nlp-minimal/models/'
+ Treat.libraries.stanford.jar_path =
+   '/ruby/stanford-core-nlp-minimal/bin/'
+ Treat.libraries.punkt.model_path =
+   '/ruby/punkt/models/'
+ Treat.libraries.reuters.model_path =
+   '/ruby/reuters/models/'
+ Treat.libraries.open_nlp.jar_path =
+   '/ruby/open-nlp-english/bin/'
+ Treat.libraries.open_nlp.model_path =
+   '/ruby/open-nlp-english/models/'
+ Treat.core.verbosity.silence = false
+
+ include Treat::Core::DSL
+
+ s = sentence "This is a sentence to parse!"
+ s.tokenize(:open_nlp).parse
+ s.print_tree
+
+ =begin
+ Treat::Builder.new do
+   p = phrase "26 Feb"
+   p.tokenize.time :kronic
+   puts p.inspect
+   s = sentence "Hello, world!"
+   s2 = sentence "Hello world"
+   puts s.similarity :jaro_winkler, to: s2
+   puts s.distance :levenshtein, to: s2
+   # puts s.similarity :tf_idf, to: s2
+ end
+
+ g = group("I was running")
+ puts g.tag.inspect
+
+ Treat.libraries.stanford.jar_path = '/ruby/treat/bin/'
+ Treat.libraries.stanford.model_path = '/ruby/treat/models/'
+
+ p = paragraph
+ s = sentence
+ w = word
+
+ p = phrase 'hello world'
+ e = email 'louis@gmail.com'
+
+ d = question(:is_feature, :word)
+ =end
+ #d = document Treat.paths.spec + 'workers/examples/english/economist/hungarys_troubles.txt'
+ #d.apply :chunk, :segment, :tokenize, :tag, :category, :name_tag
+ #d.print_tree
+ #d = document Treat.paths.spec + 'workers/examples/english/economist/saving_the_euro.odt'
+ #d.print_tree
+ =begin
+ d = document 'test.htm'
+ d.apply :chunk
+ #d.serialize :yaml, file: 'test444.yaml'
+ d.set :test, 2
+ d.serialize :mongo, db: 'test'
+ d.set :test, 3
+ d.serialize :mongo, db: 'test'
+ d.apply :segment, :tokenize, :tag, :category
+ puts d.verb_count
+ #d2 = document id: d.id, db: 'test'
+ d2 = document 'features.test' => 3, db: 'test'
+ d2.apply :segment, :tokenize, :tag, :category
+ puts d2.verb_count
+ #d.print_tree
+ #s = document 'http://www.economist.com'
+
+ p = phrase 'hello', 'world', '!'
+ puts p.to_s
+ puts p.to_str
+ =end
+
+ =begin
+ ### Super basics.
+ puts p.value
+
+ p << 'bitch'
+ p << word('hello')
+ puts p.to_s
+ puts p.to_str
+ puts p.value
+ puts p.to_ary.inspect
+ =end
+
+ =begin
+
+ ### Configuration
+
+ # A boolean value indicating whether to silence the output of external libraries (e.g. Stanford tools, Enju, LDA, Ruby-FANN) when they are used.
+ puts Treat.core.verbosity.silence
+ # A boolean value indicating whether to explain the steps that Treat is performing.
+ puts Treat.core.verbosity.debug
+ # A boolean value indicating whether Treat should try to detect the language of newly input text.
+ puts Treat.core.language.detect
+ # The language to default to when detection is off.
+ puts Treat.core.language.default
+ # A symbol representing the finest level at which language detection should be performed if language detection is turned on.
+ puts Treat.core.language.detect_at
+
+ # A directory in which to create temporary files.
+ puts Treat.paths.tmp
+ # A directory in which to store downloaded files.
+ puts Treat.paths.files
+ # A directory containing trained models for various tasks.
+ puts Treat.paths.models
+ # A directory containing the spec files.
+ puts Treat.paths.spec
+ # A directory containing executables and JAR files.
+ puts Treat.paths.bin
+ puts Treat.paths.lib
+
+ # Set up Mongoid.
+ Treat.databases.mongo.db = 'your_database'
+ Treat.databases.mongo.host = 'localhost'
+ Treat.databases.mongo.port = '27017'
+
+ # Transparent string casting.
+ s = 'inflection'.stem
+ # is equivalent to
+ s = 'inflection'.to_entity.stem
+ # which comes down to
+ s = word('inflection').stem
+
+ # Transparent number casting.
+ n = 2.ordinal
+ # is equivalent to
+ s = 2.to_entity.ordinal
+ # which comes down to
+ s = number(2).ordinal
+ =end
+ =begin
+ ### BASIC USAGE
+
+ # Create a sentence
+ s = sentence 'Those who dream by day know of at least ' +
+   '19 things that escape those who dream only at night.'
+
+ # Tokenize and tag it.
+ s.tokenize.tag
+
+ # View the sentence structure.
+ s.print_tree
+
+ # Iterate over the tokens.
+ s.each_token do |tok|
+   puts tok.value
+   puts tok.type
+ end
+
+
+
+ # Arrays instead of iterators.
+ (s.nouns + s.adjectives).each do |word|
+   puts word.synonyms
+   puts word.antonyms
+ end
+
+ # Functions on numbers.
+ s.each_number do |num|
+   puts num.ordinal
+   puts num.cardinal
+ end
+
+ # See all the annotations.
+ s.each do |tok|
+   puts tok.inspect
+ end
+
+ # Lazy way of doing all of the above.
+ s = sentence 'Those who dream by day know of at least ' +
+   '19 things that escape those who dream only at night.'
+
+ s.apply :tokenize, :tag, :category,
+   :stem, :hyponyms, :hypernyms,
+   :antonyms, :ordinal, :cardinal
+
+ =end
+
+ =begin
+ ### A BIT MORE ADVANCED USAGE
+
+ section = section "Obama-Sarkozy Meeting\n" +
+   "Obama and Sarkozy met on January 1st to investigate " +
+   "the possibility of a new rescue plan. President " +
+   "Sarkozy is to meet Merkel next Tuesday in Berlin."
+
+ # Chunk: split the titles and paragraphs.
+ # Segment: perform sentence segmentation.
+ # Parse: parse the syntax of each sentence.
+ section.apply :chunk, :segment, :parse
+
+ # View the tree structure.
+ section.print_tree
+
+ # Get some basic info on the text.
+ puts section.title
+ puts section.sentence_count
+ puts section.word_count
+
+ section.apply :category
+ puts section.noun_count
+ puts section.frequency_of 'president'
+
+ section.each_phrase_with_tag('NP') do |phrase|
+   puts phrase.to_s
+ end
+
+ =end
+ =begin
+ ### URL documents, XML serialization.
+
+ urls = ['http://www.cbc.ca/news/world/story/2012/11/25/snc-lavalin-ben-aissa-charges.html',
+   'http://www.cbc.ca/news/world/story/2012/11/25/egypt.html', 'http://www.cbc.ca/news/canada/prince-edward-island/story/2012/11/25/pei-murder-arrest-stlucia.html', 'http://www.cbc.ca/news/world/story/2012/11/25/bangladesh-garment-factory-fire.html']
+
+ c = collection
+ urls.each { |url| c << document(url) }
+
+ # View the collection.
+ c.print_tree
+
+ c.apply :chunk, :segment, :tokenize
+ c.serialize :xml, :file => 'test.xml'
+
+ # Reopen the collection.
+ c = collection 'test.xml'
+
+ # View it again.
+ c.print_tree
+ =end
+ =begin
+ include Treat::Core::DSL
+
+ # Show progress bars for download.
+ Treat.core.verbosity.silence = false
+ # Explain what Treat is doing.
+ Treat.core.verbosity.debug = true
+
+ # Define the question "is it junk?" on sentences.
+ qn = question(:is_junk, :sentence)
+
+ # Frame the problem as depending on punctuation
+ # count and word count for each sentence.
+ pb = problem(qn,
+   feature(:punctuation_count),
+   feature(:word_count) )
+
+ # Get some web documents to work on.
+ url1 = 'http://en.wikipedia.org/wiki/NOD_mouse'
+ url2 = 'http://en.wikipedia.org/wiki/Academic_studies_about_Wikipedia'
+ d1, d2 = document(url1), document(url2)
+
+ # Process both of our documents.
+ [d1,d2].apply(:chunk, :segment, :tokenize)
+
+ # Answer our problem to create a training set.
+ d1.sentences[0..17].each { |s| s.set :is_junk, 0 }
+ d1.sentences[17..-1].each { |s| s.set :is_junk, 1 }
+ d_set = d1.export(pb)
+
+ # Define our gold standard results for evaluation.
+ d2.sentences[0..81].each { |s| s.set :is_true_junk, 0 }
+ d2.sentences[81..-1].each { |s| s.set :is_true_junk, 1 }
+
+ tp, fp, tn, fn = 0.0, 0.0, 0.0, 0.0
+
+ d2.sentences.map do |s|
+   pred = s.classify(:id3, training: d_set)
+   if pred == 1
+     tp += 1 if s.is_true_junk == 1
+     fp += 1 if s.is_true_junk == 0
+   else
+     tn += 1 if s.is_true_junk == 0
+     fn += 1 if s.is_true_junk == 1
+   end
+ end
+
+ puts "Precision: #{tp/(tp + fp)}"
+ puts "Recall: #{tp/(tp + fn)}"
+ =end
+ =begin
+ d = document 'http://louismullie.com/susan-text-scan1.jpg'
+ d.apply :chunk, :segment, :tokenize
+ d.print_tree
+ =end
+ =begin
+ # Syntax example
+ phra = phrase 'Obama', 'Sarkozy', 'Meeting'
+
+ para = paragraph 'Obama and Sarkozy met on January 1st to'
+   'investigate the possibility of a new rescue plan. Nicolas ' +
+   'Sarkozy is to meet Merkel next Tuesday in Berlin.'
+
+ sect = section title(phra), para
+ =end
+ =begin
+ puts "beer".plural.inspect
+ =end
+ # Treat.core.language.detect = true
+ # s = sentence "Du hast deiner Frau einen roten Ring gekauft."
+ #s.apply(:parse,:category).print_tree