treat 1.1.1 → 1.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/files/3_2_release_notes.html +766 -0
- data/files/bc-monty-robinson-sentencing.html +1569 -0
- data/files/syria-aleppo-clashes.html +1376 -0
- data/lib/treat/config/databases/default.rb +1 -0
- data/lib/treat/config/databases/mongo.rb +1 -3
- data/lib/treat/core/data_set.rb +13 -1
- data/lib/treat/core/feature.rb +9 -2
- data/lib/treat/core/problem.rb +9 -3
- data/lib/treat/core/question.rb +12 -4
- data/lib/treat/entities/abilities/buildable.rb +38 -27
- data/lib/treat/entities/abilities/debuggable.rb +1 -1
- data/lib/treat/entities/collection.rb +7 -2
- data/lib/treat/version.rb +1 -1
- data/lib/treat/workers/formatters/serializers/mongo.rb +18 -4
- data/lib/treat/workers/formatters/unserializers/mongo.rb +20 -8
- data/lib/treat/workers/processors/parsers/stanford.rb +1 -1
- data/lib/treat/workers/retrievers/indexers/ferret.rb +2 -0
- data/spec/core.rb +32 -0
- data/spec/sandbox.rb +13 -0
- metadata +8 -2
data/lib/treat/config/databases/default.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
{adapter: :mongo}
|
data/lib/treat/core/data_set.rb
CHANGED
@@ -77,10 +77,22 @@ class Treat::Core::DataSet
|
|
77
77
|
end
|
78
78
|
end
|
79
79
|
|
80
|
+
# Merge another data set into this one.
|
81
|
+
def merge(data_set)
|
82
|
+
if data_set.problem != @problem
|
83
|
+
raise Treat::Exception,
|
84
|
+
"Cannot merge two data sets that " +
|
85
|
+
"don't reference the same problem."
|
86
|
+
else
|
87
|
+
@items << data_set.items
|
88
|
+
@entities << data_set.entities
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
80
92
|
# Unserialize a data set file created
|
81
93
|
# by using the #serialize method.
|
82
94
|
def self.unserialize(file)
|
83
|
-
data = Marshal.load(File.
|
95
|
+
data = Marshal.load(File.binread(file))
|
84
96
|
problem, items, entities = *data
|
85
97
|
problem.features.each do |feature|
|
86
98
|
next unless feature.proc
|
data/lib/treat/core/feature.rb
CHANGED
@@ -7,12 +7,12 @@ class Treat::Core::Feature
|
|
7
7
|
# that the target of your classification
|
8
8
|
# problem responds to the method
|
9
9
|
# corresponding to this name.
|
10
|
-
|
10
|
+
attr_reader :name
|
11
11
|
# A proc that can be used to perform
|
12
12
|
# calculations before storing a feature.
|
13
13
|
attr_accessor :proc
|
14
14
|
# The default value to be
|
15
|
-
|
15
|
+
attr_reader :default
|
16
16
|
|
17
17
|
# Initialize a feature for a classification
|
18
18
|
# problem. If two arguments are supplied,
|
@@ -32,4 +32,11 @@ class Treat::Core::Feature
|
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
35
|
+
# Custom comparison operator for features.
|
36
|
+
def ==(feature)
|
37
|
+
@name == feature.name &&
|
38
|
+
@proc == feature.proc &&
|
39
|
+
@default == feature.default
|
40
|
+
end
|
41
|
+
|
35
42
|
end
|
data/lib/treat/core/problem.rb
CHANGED
@@ -5,13 +5,13 @@
|
|
5
5
|
class Treat::Core::Problem
|
6
6
|
|
7
7
|
# The question we are trying to answer.
|
8
|
-
|
8
|
+
attr_reader :question
|
9
9
|
# An array of features that will be
|
10
10
|
# looked at in trying to answer the
|
11
11
|
# problem's question.
|
12
|
-
|
12
|
+
attr_reader :features
|
13
13
|
# Just the labels from the features.
|
14
|
-
|
14
|
+
attr_reader :labels
|
15
15
|
|
16
16
|
# Initialize the problem with a question
|
17
17
|
# and an arbitrary number of features.
|
@@ -20,6 +20,12 @@ class Treat::Core::Problem
|
|
20
20
|
@features = features
|
21
21
|
@labels = @features.map { |f| f.name }
|
22
22
|
end
|
23
|
+
|
24
|
+
# Custom comparison for problems.
|
25
|
+
def ==(problem)
|
26
|
+
@question == problem.question &&
|
27
|
+
@features == problem.features
|
28
|
+
end
|
23
29
|
|
24
30
|
# Return an array of all the entity's
|
25
31
|
# features, as defined by the problem.
|
data/lib/treat/core/question.rb
CHANGED
@@ -7,15 +7,15 @@ class Treat::Core::Question
|
|
7
7
|
# (e.g. is_key_sentence), which will
|
8
8
|
# also be used as the annotation name
|
9
9
|
# for the answer to the question.
|
10
|
-
|
10
|
+
attr_reader :name
|
11
11
|
# Can be :continuous or :discrete,
|
12
12
|
# depending on the features used.
|
13
|
-
|
13
|
+
attr_reader :type
|
14
14
|
# Defines the target of the question
|
15
15
|
# (e.g. :sentence, :paragraph, etc.)
|
16
|
-
|
16
|
+
attr_reader :target
|
17
17
|
# Default for the answer to the question.
|
18
|
-
|
18
|
+
attr_reader :default
|
19
19
|
|
20
20
|
# Initialize the question.
|
21
21
|
def initialize(name, target,
|
@@ -24,4 +24,12 @@ class Treat::Core::Question
|
|
24
24
|
@type, @default = type, default
|
25
25
|
end
|
26
26
|
|
27
|
+
# Custom comparison operator for questions.
|
28
|
+
def ==(question)
|
29
|
+
@name == question.name &&
|
30
|
+
@type == question.type &&
|
31
|
+
@target == question.target &&
|
32
|
+
@default == question.default
|
33
|
+
end
|
34
|
+
|
27
35
|
end
|
data/lib/treat/entities/abilities/buildable.rb
CHANGED
@@ -4,11 +4,11 @@
|
|
4
4
|
# is pretty much self-explanatory.
|
5
5
|
# FIXME how can we make this language independent?
|
6
6
|
module Treat::Entities::Abilities::Buildable
|
7
|
-
|
7
|
+
|
8
8
|
require 'schiphol'
|
9
9
|
require 'fileutils'
|
10
10
|
require 'uri'
|
11
|
-
|
11
|
+
|
12
12
|
# Simple regexps to match common entities.
|
13
13
|
WordRegexp = /^[[:alpha:]\-']+$/
|
14
14
|
NumberRegexp = /^#?([0-9]+)(\.[0-9]+)?$/
|
@@ -16,7 +16,7 @@ module Treat::Entities::Abilities::Buildable
|
|
16
16
|
UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
|
17
17
|
EmailRegexp = /.+\@.+\..+/
|
18
18
|
Enclitics = %w['ll 'm 're 's 't 've]
|
19
|
-
|
19
|
+
|
20
20
|
# Reserved folder names
|
21
21
|
Reserved = ['.index']
|
22
22
|
|
@@ -26,8 +26,11 @@ module Treat::Entities::Abilities::Buildable
|
|
26
26
|
def build(file_or_value, options = {})
|
27
27
|
|
28
28
|
fv = file_or_value.to_s
|
29
|
-
|
30
|
-
|
29
|
+
|
30
|
+
if file_or_value.is_a?(Hash)
|
31
|
+
from_db(file_or_value)
|
32
|
+
elsif self == Treat::Entities::Document ||
|
33
|
+
(fv.index('yml') || fv.index('yaml') ||
|
31
34
|
fv.index('xml') || fv.index('mongo'))
|
32
35
|
if fv =~ UriRegexp
|
33
36
|
from_url(fv, options)
|
@@ -52,7 +55,7 @@ module Treat::Entities::Abilities::Buildable
|
|
52
55
|
"filename, string or number."
|
53
56
|
end
|
54
57
|
end
|
55
|
-
|
58
|
+
|
56
59
|
end
|
57
60
|
|
58
61
|
# Build an entity from a string. Type is
|
@@ -76,7 +79,7 @@ module Treat::Entities::Abilities::Buildable
|
|
76
79
|
end
|
77
80
|
|
78
81
|
e
|
79
|
-
|
82
|
+
|
80
83
|
end
|
81
84
|
|
82
85
|
# Build a document from an URL.
|
@@ -87,14 +90,14 @@ module Treat::Entities::Abilities::Buildable
|
|
87
90
|
'Cannot create something ' +
|
88
91
|
'else than a document from a url.'
|
89
92
|
end
|
90
|
-
|
93
|
+
|
91
94
|
f = Schiphol.download(url,
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
95
|
+
:download_folder => Treat.paths.files,
|
96
|
+
:show_progress => Treat.core.verbosity.silence,
|
97
|
+
:rectify_extensions => true,
|
98
|
+
:max_tries => 3
|
96
99
|
)
|
97
|
-
|
100
|
+
|
98
101
|
options[:default_to] ||= 'html'
|
99
102
|
|
100
103
|
e = from_file(f, options)
|
@@ -123,7 +126,7 @@ module Treat::Entities::Abilities::Buildable
|
|
123
126
|
def from_folder(folder, options)
|
124
127
|
|
125
128
|
return if Reserved.include?(folder)
|
126
|
-
|
129
|
+
|
127
130
|
unless FileTest.directory?(folder)
|
128
131
|
raise Treat::Exception,
|
129
132
|
"Path '#{folder}' does " +
|
@@ -163,9 +166,9 @@ module Treat::Entities::Abilities::Buildable
|
|
163
166
|
# Build a document from a raw or serialized file.
|
164
167
|
def from_file(file, options)
|
165
168
|
|
166
|
-
if file.index('yml') ||
|
167
|
-
file.index('yaml') ||
|
168
|
-
file.index('xml') ||
|
169
|
+
if file.index('yml') ||
|
170
|
+
file.index('yaml') ||
|
171
|
+
file.index('xml') ||
|
169
172
|
file.index('mongo')
|
170
173
|
from_serialized_file(file, options)
|
171
174
|
else
|
@@ -174,7 +177,7 @@ module Treat::Entities::Abilities::Buildable
|
|
174
177
|
options[:_format] = fmt
|
175
178
|
from_raw_file(file, options)
|
176
179
|
end
|
177
|
-
|
180
|
+
|
178
181
|
end
|
179
182
|
|
180
183
|
# Build a document from a raw file.
|
@@ -186,13 +189,13 @@ module Treat::Entities::Abilities::Buildable
|
|
186
189
|
"Cannot create something else than a " +
|
187
190
|
"document from raw file '#{file}'."
|
188
191
|
end
|
189
|
-
|
192
|
+
|
190
193
|
unless File.readable?(file)
|
191
194
|
raise Treat::Exception,
|
192
195
|
"Path '#{file}' does not "+
|
193
196
|
"point to a readable file."
|
194
197
|
end
|
195
|
-
|
198
|
+
|
196
199
|
d = Treat::Entities::Document.new(file)
|
197
200
|
|
198
201
|
d.read(:autoselect, options)
|
@@ -217,13 +220,20 @@ module Treat::Entities::Abilities::Buildable
|
|
217
220
|
d.children[0].set_as_root! # Fix this
|
218
221
|
d.children[0]
|
219
222
|
end
|
220
|
-
|
223
|
+
|
221
224
|
end
|
222
|
-
|
223
|
-
def from_db(
|
224
|
-
|
225
|
-
|
226
|
-
|
225
|
+
|
226
|
+
def from_db(hash)
|
227
|
+
adapter = (hash.delete(:adapter) ||
|
228
|
+
Treat.databases.default.adapter)
|
229
|
+
unless adapter
|
230
|
+
raise Treat::Exception,
|
231
|
+
"You must supply which database " +
|
232
|
+
"adapter to use by passing the :adapter " +
|
233
|
+
"option or setting configuration option" +
|
234
|
+
"Treat.databases.default.adapter"
|
235
|
+
end
|
236
|
+
self.new.unserialize(adapter, hash)
|
227
237
|
end
|
228
238
|
|
229
239
|
# Build any kind of entity from a string.
|
@@ -321,10 +331,11 @@ module Treat::Entities::Abilities::Buildable
|
|
321
331
|
end
|
322
332
|
|
323
333
|
end
|
324
|
-
|
334
|
+
|
325
335
|
def create_collection(fv)
|
326
336
|
FileUtils.mkdir(fv)
|
327
337
|
Treat::Entities::Collection.new(fv)
|
328
338
|
end
|
329
339
|
|
340
|
+
|
330
341
|
end
|
data/lib/treat/entities/collection.rb
CHANGED
@@ -6,7 +6,10 @@ module Treat::Entities
|
|
6
6
|
# containing the texts of the collection.
|
7
7
|
def initialize(folder = nil, id = nil)
|
8
8
|
super('', id)
|
9
|
-
|
9
|
+
if folder && !FileTest.directory?(folder)
|
10
|
+
FileUtils.mkdir(folder)
|
11
|
+
end
|
12
|
+
set :folder, folder if folder
|
10
13
|
i = folder + '/.index'
|
11
14
|
set :index, i if FileTest.directory?(i)
|
12
15
|
end
|
@@ -21,11 +24,13 @@ module Treat::Entities
|
|
21
24
|
end
|
22
25
|
entities.each do |entity|
|
23
26
|
if [:document, :collection].
|
24
|
-
include?(entity.type) && copy
|
27
|
+
include?(entity.type) && copy &&
|
28
|
+
@features[:folder] != nil
|
25
29
|
entity = entity.copy_into(self)
|
26
30
|
end
|
27
31
|
end
|
28
32
|
super(entities)
|
29
33
|
end
|
34
|
+
|
30
35
|
end
|
31
36
|
end
|
data/lib/treat/version.rb
CHANGED
data/lib/treat/workers/formatters/serializers/mongo.rb
CHANGED
@@ -31,10 +31,25 @@ class Treat::Workers::Formatters::Serializers::Mongo
|
|
31
31
|
entity.type.to_s.capitalize.intern).superclass).downcase
|
32
32
|
supertype = entity.type.to_s if supertype == 'entity'
|
33
33
|
supertypes = supertype + 's'
|
34
|
-
|
34
|
+
|
35
35
|
coll = @@database.collection(supertypes)
|
36
|
-
|
37
|
-
|
36
|
+
|
37
|
+
if entity.type == :collection
|
38
|
+
docs = @@database.collection('documents')
|
39
|
+
coll.update(
|
40
|
+
{id: entity.id}, self.do_serialize(entity,
|
41
|
+
options.merge({:stop_at => Treat::Entities::Document})),
|
42
|
+
{upsert: true})
|
43
|
+
entity.each_document do |doc|
|
44
|
+
docs.update(
|
45
|
+
{id: doc.id}, self.do_serialize(doc, options),
|
46
|
+
{upsert: true})
|
47
|
+
end
|
48
|
+
else
|
49
|
+
entity_token = self.do_serialize(entity, options)
|
50
|
+
coll.update({id: entity.id}, entity_token, {upsert: true})
|
51
|
+
end
|
52
|
+
|
38
53
|
end
|
39
54
|
|
40
55
|
def self.do_serialize(entity, options)
|
@@ -52,7 +67,6 @@ class Treat::Workers::Formatters::Serializers::Mongo
|
|
52
67
|
entity_token = {
|
53
68
|
:id => entity.id,
|
54
69
|
:value => entity.value,
|
55
|
-
:string => entity.to_s,
|
56
70
|
:type => entity.type.to_s,
|
57
71
|
:children => children,
|
58
72
|
:parent => (entity.has_parent? ? entity.parent.id : nil),
|
data/lib/treat/workers/formatters/unserializers/mongo.rb
CHANGED
@@ -3,8 +3,11 @@ module Treat::Workers::Formatters::Unserializers::Mongo
|
|
3
3
|
require 'mongo'
|
4
4
|
|
5
5
|
def self.unserialize(entity, options={})
|
6
|
-
|
7
|
-
|
6
|
+
|
7
|
+
db = options.delete(:db)
|
8
|
+
selector = options
|
9
|
+
|
10
|
+
if !Treat.databases.mongo.db && !db
|
8
11
|
raise Treat::Exception,
|
9
12
|
'Must supply the database name in config. ' +
|
10
13
|
'(Treat.databases.mongo.db = ...) or pass ' +
|
@@ -13,7 +16,7 @@ module Treat::Workers::Formatters::Unserializers::Mongo
|
|
13
16
|
|
14
17
|
@@database ||= Mongo::Connection.
|
15
18
|
new(Treat.databases.mongo.host).
|
16
|
-
db(Treat.databases.mongo.db ||
|
19
|
+
db(Treat.databases.mongo.db || db)
|
17
20
|
|
18
21
|
supertype = cl(Treat::Entities.const_get(
|
19
22
|
entity.type.to_s.capitalize.intern).superclass).downcase
|
@@ -21,14 +24,23 @@ module Treat::Workers::Formatters::Unserializers::Mongo
|
|
21
24
|
supertypes = supertype + 's'
|
22
25
|
|
23
26
|
coll = @@database.collection(supertypes)
|
24
|
-
|
27
|
+
records = coll.find(selector).to_a
|
25
28
|
|
26
|
-
|
29
|
+
if records.size == 0
|
27
30
|
raise Treat::Exception,
|
28
|
-
"Couldn't find
|
31
|
+
"Couldn't find any records using " +
|
32
|
+
"selector #{selector.inspect}."
|
33
|
+
elsif records.size == 1
|
34
|
+
self.do_unserialize(
|
35
|
+
records.first, options)
|
36
|
+
else
|
37
|
+
matches = []
|
38
|
+
records.each do |record|
|
39
|
+
matches << self.
|
40
|
+
do_unserialize(record, options)
|
41
|
+
end
|
42
|
+
matches
|
29
43
|
end
|
30
|
-
|
31
|
-
self.do_unserialize(record, options)
|
32
44
|
|
33
45
|
end
|
34
46
|
|
data/lib/treat/workers/processors/parsers/stanford.rb
CHANGED
@@ -29,7 +29,7 @@ class Treat::Workers::Processors::Parsers::Stanford
|
|
29
29
|
lang = entity.language
|
30
30
|
init(lang, options)
|
31
31
|
|
32
|
-
tag_set = StanfordCoreNLP::Config::TagSets[
|
32
|
+
tag_set = StanfordCoreNLP::Config::TagSets[lang]
|
33
33
|
|
34
34
|
text = ::StanfordCoreNLP::Text.new(val)
|
35
35
|
@@parsers[lang].annotate(text)
|
data/lib/treat/workers/retrievers/indexers/ferret.rb
CHANGED
@@ -18,6 +18,8 @@ class Treat::Workers::Retrievers::Indexers::Ferret
|
|
18
18
|
# index for future use (e.g. in searching).
|
19
19
|
def self.index(collection, options = {})
|
20
20
|
|
21
|
+
# FIXME - what if the collection is stored
|
22
|
+
# in a database?
|
21
23
|
path = "#{collection.folder}/.index"
|
22
24
|
return path if FileTest.directory?(path)
|
23
25
|
|
data/spec/core.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
require_relative '../lib/treat'
|
2
|
+
|
3
|
+
describe Treat::Core::DataSet do
|
4
|
+
=begin
|
5
|
+
|
6
|
+
|
7
|
+
p = Problem(
|
8
|
+
Question(:is_key_sentence, :sentence, false),
|
9
|
+
Feature(:word_count, 0)
|
10
|
+
)
|
11
|
+
|
12
|
+
p2 = Problem(
|
13
|
+
Question(:is_key_sentence, :sentence, false),
|
14
|
+
Feature(:word_count, 0)
|
15
|
+
)
|
16
|
+
|
17
|
+
ds = DataSet(p)
|
18
|
+
|
19
|
+
text = Paragraph("Welcome to the zoo! This is a text.")
|
20
|
+
text2 = Paragraph("Welcome here my friend. This is well, a text.")
|
21
|
+
|
22
|
+
text.do :segment, :tokenize
|
23
|
+
text2.do :segment, :tokenize
|
24
|
+
|
25
|
+
ds1 = text.export(p)
|
26
|
+
ds2 = text2.export(p2)
|
27
|
+
|
28
|
+
ds1.merge(ds2)
|
29
|
+
|
30
|
+
puts ds1.inspect
|
31
|
+
=end
|
32
|
+
end
|
data/spec/sandbox.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require_relative '../lib/treat'
|
2
|
+
|
3
|
+
Treat.databases.mongo.db = 'testingyetagain'
|
4
|
+
|
5
|
+
d = Document('http://www.cbc.ca/news/world/story/2012/07/21/syria-aleppo-clashes.html')
|
6
|
+
|
7
|
+
d2 = Document('http://www.cbc.ca/news/canada/british-columbia/story/2012/07/19/bc-monty-robinson-sentencing.html')
|
8
|
+
|
9
|
+
c = Collection('testing')
|
10
|
+
c << d
|
11
|
+
c << d2
|
12
|
+
|
13
|
+
c.serialize :mongo
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: treat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.1
|
4
|
+
version: 1.1.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-07-
|
12
|
+
date: 2012-07-23 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: schiphol
|
@@ -106,6 +106,7 @@ files:
|
|
106
106
|
- lib/treat/config/core/paths.rb
|
107
107
|
- lib/treat/config/core/syntax.rb
|
108
108
|
- lib/treat/config/core/verbosity.rb
|
109
|
+
- lib/treat/config/databases/default.rb
|
109
110
|
- lib/treat/config/databases/mongo.rb
|
110
111
|
- lib/treat/config/languages/agnostic.rb
|
111
112
|
- lib/treat/config/languages/arabic.rb
|
@@ -243,6 +244,7 @@ files:
|
|
243
244
|
- lib/treat/workers.rb
|
244
245
|
- lib/treat.rb
|
245
246
|
- spec/collection.rb
|
247
|
+
- spec/core.rb
|
246
248
|
- spec/document.rb
|
247
249
|
- spec/entity.rb
|
248
250
|
- spec/node.rb
|
@@ -252,12 +254,16 @@ files:
|
|
252
254
|
- spec/samples/mathematicians/gauss.pdf
|
253
255
|
- spec/samples/mathematicians/leibniz.txt
|
254
256
|
- spec/samples/mathematicians/newton.doc
|
257
|
+
- spec/sandbox.rb
|
255
258
|
- spec/token.rb
|
256
259
|
- spec/treat.rb
|
257
260
|
- spec/word.rb
|
258
261
|
- spec/zone.rb
|
259
262
|
- tmp/MANIFEST
|
263
|
+
- files/3_2_release_notes.html
|
264
|
+
- files/bc-monty-robinson-sentencing.html
|
260
265
|
- files/MANIFEST
|
266
|
+
- files/syria-aleppo-clashes.html
|
261
267
|
- README.md
|
262
268
|
- LICENSE
|
263
269
|
homepage: https://github.com/louismullie/treat
|