treat 1.1.1 → 1.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1 @@
1
+ {adapter: :mongo}
@@ -1,3 +1 @@
1
- {host: 'localhost',
2
- port: '27017',
3
- db: nil }
1
+ {host: 'localhost', port: '27017', db: nil }
@@ -77,10 +77,22 @@ class Treat::Core::DataSet
77
77
  end
78
78
  end
79
79
 
80
+ # Merge another data set into this one.
81
+ def merge(data_set)
82
+ if data_set.problem != @problem
83
+ raise Treat::Exception,
84
+ "Cannot merge two data sets that " +
85
+ "don't reference the same problem."
86
+ else
87
+ @items << data_set.items
88
+ @entities << data_set.entities
89
+ end
90
+ end
91
+
80
92
  # Unserialize a data set file created
81
93
  # by using the #serialize method.
82
94
  def self.unserialize(file)
83
- data = Marshal.load(File.read(file))
95
+ data = Marshal.load(File.binread(file))
84
96
  problem, items, entities = *data
85
97
  problem.features.each do |feature|
86
98
  next unless feature.proc
@@ -7,12 +7,12 @@ class Treat::Core::Feature
7
7
  # that the target of your classification
8
8
  # problem responds to the method
9
9
  # corresponding to this name.
10
- attr_accessor :name
10
+ attr_reader :name
11
11
  # A proc that can be used to perform
12
12
  # calculations before storing a feature.
13
13
  attr_accessor :proc
14
14
  # The default value to be
15
- attr_accessor :default
15
+ attr_reader :default
16
16
 
17
17
  # Initialize a feature for a classification
18
18
  # problem. If two arguments are supplied,
@@ -32,4 +32,11 @@ class Treat::Core::Feature
32
32
  end
33
33
  end
34
34
 
35
+ # Custom comparison operator for features.
36
+ def ==(feature)
37
+ @name == feature.name &&
38
+ @proc == feature.proc &&
39
+ @default == feature.default
40
+ end
41
+
35
42
  end
@@ -5,13 +5,13 @@
5
5
  class Treat::Core::Problem
6
6
 
7
7
  # The question we are trying to answer.
8
- attr_accessor :question
8
+ attr_reader :question
9
9
  # An array of features that will be
10
10
  # looked at in trying to answer the
11
11
  # problem's question.
12
- attr_accessor :features
12
+ attr_reader :features
13
13
  # Just the labels from the features.
14
- attr_accessor :labels
14
+ attr_reader :labels
15
15
 
16
16
  # Initialize the problem with a question
17
17
  # and an arbitrary number of features.
@@ -20,6 +20,12 @@ class Treat::Core::Problem
20
20
  @features = features
21
21
  @labels = @features.map { |f| f.name }
22
22
  end
23
+
24
+ # Custom comparison for problems.
25
+ def ==(problem)
26
+ @question == problem.question &&
27
+ @features == problem.features
28
+ end
23
29
 
24
30
  # Return an array of all the entity's
25
31
  # features, as defined by the problem.
@@ -7,15 +7,15 @@ class Treat::Core::Question
7
7
  # (e.g. is_key_sentence), which will
8
8
  # also be used as the annotation name
9
9
  # for the answer to the question.
10
- attr_accessor :name
10
+ attr_reader :name
11
11
  # Can be :continuous or :discrete,
12
12
  # depending on the features used.
13
- attr_accessor :type
13
+ attr_reader :type
14
14
  # Defines the target of the question
15
15
  # (e.g. :sentence, :paragraph, etc.)
16
- attr_accessor :target
16
+ attr_reader :target
17
17
  # Default for the answer to the question.
18
- attr_accessor :default
18
+ attr_reader :default
19
19
 
20
20
  # Initialize the question.
21
21
  def initialize(name, target,
@@ -24,4 +24,12 @@ class Treat::Core::Question
24
24
  @type, @default = type, default
25
25
  end
26
26
 
27
+ # Custom comparison operator for questions.
28
+ def ==(question)
29
+ @name == question.name &&
30
+ @type == question.type &&
31
+ @target == question.target &&
32
+ @default == question.default
33
+ end
34
+
27
35
  end
@@ -4,11 +4,11 @@
4
4
  # is pretty much self-explanatory.
5
5
  # FIXME how can we make this language independent?
6
6
  module Treat::Entities::Abilities::Buildable
7
-
7
+
8
8
  require 'schiphol'
9
9
  require 'fileutils'
10
10
  require 'uri'
11
-
11
+
12
12
  # Simple regexps to match common entities.
13
13
  WordRegexp = /^[[:alpha:]\-']+$/
14
14
  NumberRegexp = /^#?([0-9]+)(\.[0-9]+)?$/
@@ -16,7 +16,7 @@ module Treat::Entities::Abilities::Buildable
16
16
  UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
17
17
  EmailRegexp = /.+\@.+\..+/
18
18
  Enclitics = %w['ll 'm 're 's 't 've]
19
-
19
+
20
20
  # Reserved folder names
21
21
  Reserved = ['.index']
22
22
 
@@ -26,8 +26,11 @@ module Treat::Entities::Abilities::Buildable
26
26
  def build(file_or_value, options = {})
27
27
 
28
28
  fv = file_or_value.to_s
29
- if self == Treat::Entities::Document ||
30
- (fv.index('yml') || fv.index('yaml') ||
29
+
30
+ if file_or_value.is_a?(Hash)
31
+ from_db(file_or_value)
32
+ elsif self == Treat::Entities::Document ||
33
+ (fv.index('yml') || fv.index('yaml') ||
31
34
  fv.index('xml') || fv.index('mongo'))
32
35
  if fv =~ UriRegexp
33
36
  from_url(fv, options)
@@ -52,7 +55,7 @@ module Treat::Entities::Abilities::Buildable
52
55
  "filename, string or number."
53
56
  end
54
57
  end
55
-
58
+
56
59
  end
57
60
 
58
61
  # Build an entity from a string. Type is
@@ -76,7 +79,7 @@ module Treat::Entities::Abilities::Buildable
76
79
  end
77
80
 
78
81
  e
79
-
82
+
80
83
  end
81
84
 
82
85
  # Build a document from an URL.
@@ -87,14 +90,14 @@ module Treat::Entities::Abilities::Buildable
87
90
  'Cannot create something ' +
88
91
  'else than a document from a url.'
89
92
  end
90
-
93
+
91
94
  f = Schiphol.download(url,
92
- :download_folder => Treat.paths.files,
93
- :show_progress => Treat.core.verbosity.silence,
94
- :rectify_extensions => true,
95
- :max_tries => 3
95
+ :download_folder => Treat.paths.files,
96
+ :show_progress => Treat.core.verbosity.silence,
97
+ :rectify_extensions => true,
98
+ :max_tries => 3
96
99
  )
97
-
100
+
98
101
  options[:default_to] ||= 'html'
99
102
 
100
103
  e = from_file(f, options)
@@ -123,7 +126,7 @@ module Treat::Entities::Abilities::Buildable
123
126
  def from_folder(folder, options)
124
127
 
125
128
  return if Reserved.include?(folder)
126
-
129
+
127
130
  unless FileTest.directory?(folder)
128
131
  raise Treat::Exception,
129
132
  "Path '#{folder}' does " +
@@ -163,9 +166,9 @@ module Treat::Entities::Abilities::Buildable
163
166
  # Build a document from a raw or serialized file.
164
167
  def from_file(file, options)
165
168
 
166
- if file.index('yml') ||
167
- file.index('yaml') ||
168
- file.index('xml') ||
169
+ if file.index('yml') ||
170
+ file.index('yaml') ||
171
+ file.index('xml') ||
169
172
  file.index('mongo')
170
173
  from_serialized_file(file, options)
171
174
  else
@@ -174,7 +177,7 @@ module Treat::Entities::Abilities::Buildable
174
177
  options[:_format] = fmt
175
178
  from_raw_file(file, options)
176
179
  end
177
-
180
+
178
181
  end
179
182
 
180
183
  # Build a document from a raw file.
@@ -186,13 +189,13 @@ module Treat::Entities::Abilities::Buildable
186
189
  "Cannot create something else than a " +
187
190
  "document from raw file '#{file}'."
188
191
  end
189
-
192
+
190
193
  unless File.readable?(file)
191
194
  raise Treat::Exception,
192
195
  "Path '#{file}' does not "+
193
196
  "point to a readable file."
194
197
  end
195
-
198
+
196
199
  d = Treat::Entities::Document.new(file)
197
200
 
198
201
  d.read(:autoselect, options)
@@ -217,13 +220,20 @@ module Treat::Entities::Abilities::Buildable
217
220
  d.children[0].set_as_root! # Fix this
218
221
  d.children[0]
219
222
  end
220
-
223
+
221
224
  end
222
-
223
- def from_db(adapter, options)
224
- id = options[:id]
225
- e = self.new(nil, id)
226
- e.unserialize(adapter, options)
225
+
226
+ def from_db(hash)
227
+ adapter = (hash.delete(:adapter) ||
228
+ Treat.databases.default.adapter)
229
+ unless adapter
230
+ raise Treat::Exception,
231
+ "You must supply which database " +
232
+ "adapter to use by passing the :adapter " +
233
+ "option or setting configuration option" +
234
+ "Treat.databases.default.adapter"
235
+ end
236
+ self.new.unserialize(adapter, hash)
227
237
  end
228
238
 
229
239
  # Build any kind of entity from a string.
@@ -321,10 +331,11 @@ module Treat::Entities::Abilities::Buildable
321
331
  end
322
332
 
323
333
  end
324
-
334
+
325
335
  def create_collection(fv)
326
336
  FileUtils.mkdir(fv)
327
337
  Treat::Entities::Collection.new(fv)
328
338
  end
329
339
 
340
+
330
341
  end
@@ -58,7 +58,7 @@ module Treat::Entities::Abilities::Debuggable
58
58
  worker.to_s.gsub('_', ' ')
59
59
  end
60
60
 
61
- curr.gsub!('ss', 's')
61
+ curr.gsub!('ss', 's') unless curr.index('class')
62
62
  curr += '.'
63
63
 
64
64
  if curr == @@prev
@@ -6,7 +6,10 @@ module Treat::Entities
6
6
  # containing the texts of the collection.
7
7
  def initialize(folder = nil, id = nil)
8
8
  super('', id)
9
- set :folder, folder
9
+ if folder && !FileTest.directory?(folder)
10
+ FileUtils.mkdir(folder)
11
+ end
12
+ set :folder, folder if folder
10
13
  i = folder + '/.index'
11
14
  set :index, i if FileTest.directory?(i)
12
15
  end
@@ -21,11 +24,13 @@ module Treat::Entities
21
24
  end
22
25
  entities.each do |entity|
23
26
  if [:document, :collection].
24
- include?(entity.type) && copy
27
+ include?(entity.type) && copy &&
28
+ @features[:folder] != nil
25
29
  entity = entity.copy_into(self)
26
30
  end
27
31
  end
28
32
  super(entities)
29
33
  end
34
+
30
35
  end
31
36
  end
data/lib/treat/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Treat
2
- VERSION = "1.1.1"
2
+ VERSION = "1.1.2"
3
3
  end
@@ -31,10 +31,25 @@ class Treat::Workers::Formatters::Serializers::Mongo
31
31
  entity.type.to_s.capitalize.intern).superclass).downcase
32
32
  supertype = entity.type.to_s if supertype == 'entity'
33
33
  supertypes = supertype + 's'
34
-
34
+
35
35
  coll = @@database.collection(supertypes)
36
- entity_token = self.do_serialize(entity, options)
37
- coll.update({id: entity.id}, entity_token, {upsert: true})
36
+
37
+ if entity.type == :collection
38
+ docs = @@database.collection('documents')
39
+ coll.update(
40
+ {id: entity.id}, self.do_serialize(entity,
41
+ options.merge({:stop_at => Treat::Entities::Document})),
42
+ {upsert: true})
43
+ entity.each_document do |doc|
44
+ docs.update(
45
+ {id: doc.id}, self.do_serialize(doc, options),
46
+ {upsert: true})
47
+ end
48
+ else
49
+ entity_token = self.do_serialize(entity, options)
50
+ coll.update({id: entity.id}, entity_token, {upsert: true})
51
+ end
52
+
38
53
  end
39
54
 
40
55
  def self.do_serialize(entity, options)
@@ -52,7 +67,6 @@ class Treat::Workers::Formatters::Serializers::Mongo
52
67
  entity_token = {
53
68
  :id => entity.id,
54
69
  :value => entity.value,
55
- :string => entity.to_s,
56
70
  :type => entity.type.to_s,
57
71
  :children => children,
58
72
  :parent => (entity.has_parent? ? entity.parent.id : nil),
@@ -3,8 +3,11 @@ module Treat::Workers::Formatters::Unserializers::Mongo
3
3
  require 'mongo'
4
4
 
5
5
  def self.unserialize(entity, options={})
6
-
7
- if !Treat.databases.mongo.db && !options[:db]
6
+
7
+ db = options.delete(:db)
8
+ selector = options
9
+
10
+ if !Treat.databases.mongo.db && !db
8
11
  raise Treat::Exception,
9
12
  'Must supply the database name in config. ' +
10
13
  '(Treat.databases.mongo.db = ...) or pass ' +
@@ -13,7 +16,7 @@ module Treat::Workers::Formatters::Unserializers::Mongo
13
16
 
14
17
  @@database ||= Mongo::Connection.
15
18
  new(Treat.databases.mongo.host).
16
- db(Treat.databases.mongo.db || options[:db])
19
+ db(Treat.databases.mongo.db || db)
17
20
 
18
21
  supertype = cl(Treat::Entities.const_get(
19
22
  entity.type.to_s.capitalize.intern).superclass).downcase
@@ -21,14 +24,23 @@ module Treat::Workers::Formatters::Unserializers::Mongo
21
24
  supertypes = supertype + 's'
22
25
 
23
26
  coll = @@database.collection(supertypes)
24
- record = coll.find_one(:id => entity.id)
27
+ records = coll.find(selector).to_a
25
28
 
26
- unless record
29
+ if records.size == 0
27
30
  raise Treat::Exception,
28
- "Couldn't find record ID #{entity.id}."
31
+ "Couldn't find any records using " +
32
+ "selector #{selector.inspect}."
33
+ elsif records.size == 1
34
+ self.do_unserialize(
35
+ records.first, options)
36
+ else
37
+ matches = []
38
+ records.each do |record|
39
+ matches << self.
40
+ do_unserialize(record, options)
41
+ end
42
+ matches
29
43
  end
30
-
31
- self.do_unserialize(record, options)
32
44
 
33
45
  end
34
46
 
@@ -29,7 +29,7 @@ class Treat::Workers::Processors::Parsers::Stanford
29
29
  lang = entity.language
30
30
  init(lang, options)
31
31
 
32
- tag_set = StanfordCoreNLP::Config::TagSets[language]
32
+ tag_set = StanfordCoreNLP::Config::TagSets[lang]
33
33
 
34
34
  text = ::StanfordCoreNLP::Text.new(val)
35
35
  @@parsers[lang].annotate(text)
@@ -18,6 +18,8 @@ class Treat::Workers::Retrievers::Indexers::Ferret
18
18
  # index for future use (e.g. in searching).
19
19
  def self.index(collection, options = {})
20
20
 
21
+ # FIXME - what if the collection is stored
22
+ # in a database?
21
23
  path = "#{collection.folder}/.index"
22
24
  return path if FileTest.directory?(path)
23
25
 
data/spec/core.rb ADDED
@@ -0,0 +1,32 @@
1
+ require_relative '../lib/treat'
2
+
3
+ describe Treat::Core::DataSet do
4
+ =begin
5
+
6
+
7
+ p = Problem(
8
+ Question(:is_key_sentence, :sentence, false),
9
+ Feature(:word_count, 0)
10
+ )
11
+
12
+ p2 = Problem(
13
+ Question(:is_key_sentence, :sentence, false),
14
+ Feature(:word_count, 0)
15
+ )
16
+
17
+ ds = DataSet(p)
18
+
19
+ text = Paragraph("Welcome to the zoo! This is a text.")
20
+ text2 = Paragraph("Welcome here my friend. This is well, a text.")
21
+
22
+ text.do :segment, :tokenize
23
+ text2.do :segment, :tokenize
24
+
25
+ ds1 = text.export(p)
26
+ ds2 = text2.export(p2)
27
+
28
+ ds1.merge(ds2)
29
+
30
+ puts ds1.inspect
31
+ =end
32
+ end
data/spec/sandbox.rb ADDED
@@ -0,0 +1,13 @@
1
+ require_relative '../lib/treat'
2
+
3
+ Treat.databases.mongo.db = 'testingyetagain'
4
+
5
+ d = Document('http://www.cbc.ca/news/world/story/2012/07/21/syria-aleppo-clashes.html')
6
+
7
+ d2 = Document('http://www.cbc.ca/news/canada/british-columbia/story/2012/07/19/bc-monty-robinson-sentencing.html')
8
+
9
+ c = Collection('testing')
10
+ c << d
11
+ c << d2
12
+
13
+ c.serialize :mongo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: treat
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.1.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-19 00:00:00.000000000 Z
12
+ date: 2012-07-23 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: schiphol
@@ -106,6 +106,7 @@ files:
106
106
  - lib/treat/config/core/paths.rb
107
107
  - lib/treat/config/core/syntax.rb
108
108
  - lib/treat/config/core/verbosity.rb
109
+ - lib/treat/config/databases/default.rb
109
110
  - lib/treat/config/databases/mongo.rb
110
111
  - lib/treat/config/languages/agnostic.rb
111
112
  - lib/treat/config/languages/arabic.rb
@@ -243,6 +244,7 @@ files:
243
244
  - lib/treat/workers.rb
244
245
  - lib/treat.rb
245
246
  - spec/collection.rb
247
+ - spec/core.rb
246
248
  - spec/document.rb
247
249
  - spec/entity.rb
248
250
  - spec/node.rb
@@ -252,12 +254,16 @@ files:
252
254
  - spec/samples/mathematicians/gauss.pdf
253
255
  - spec/samples/mathematicians/leibniz.txt
254
256
  - spec/samples/mathematicians/newton.doc
257
+ - spec/sandbox.rb
255
258
  - spec/token.rb
256
259
  - spec/treat.rb
257
260
  - spec/word.rb
258
261
  - spec/zone.rb
259
262
  - tmp/MANIFEST
263
+ - files/3_2_release_notes.html
264
+ - files/bc-monty-robinson-sentencing.html
260
265
  - files/MANIFEST
266
+ - files/syria-aleppo-clashes.html
261
267
  - README.md
262
268
  - LICENSE
263
269
  homepage: https://github.com/louismullie/treat