treat 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +1 -1
- data/lib/treat.rb +1 -1
- data/lib/treat/ai/classifiers/id3.rb +6 -4
- data/lib/treat/classification.rb +10 -11
- data/lib/treat/entities/abilities/buildable.rb +2 -3
- data/lib/treat/entities/abilities/countable.rb +7 -5
- data/lib/treat/entities/abilities/iterable.rb +9 -0
- data/lib/treat/entities/abilities/magical.rb +10 -5
- data/lib/treat/extractors/keywords/tf_idf.rb +21 -18
- data/lib/treat/loaders/stanford.rb +1 -1
- data/spec/document.rb +13 -0
- data/spec/sandbox.rb +114 -3
- metadata +2 -2
data/README.md
CHANGED
@@ -19,7 +19,7 @@ Treat is a toolkit for natural language processing and computational linguistics
|
|
19
19
|
|
20
20
|
**Resources**
|
21
21
|
|
22
|
-
* Read the [latest documentation](http://rubydoc.info/github/louismullie/treat/
|
22
|
+
* Read the [latest documentation](http://rubydoc.info/github/louismullie/treat/frames).
|
23
23
|
* See how to [install Treat](https://github.com/louismullie/treat/wiki/Installing-Treat).
|
24
24
|
* Learn how to [use Treat](https://github.com/louismullie/treat/wiki/Using-Treat).
|
25
25
|
* Help out by [contributing to the project](https://github.com/louismullie/treat/wiki/Contributing-to-Treat).
|
data/lib/treat.rb
CHANGED
data/lib/treat/ai/classifiers/id3.rb
CHANGED
@@ -5,22 +5,24 @@ class Treat::AI::Classifiers::ID3
|
|
5
5
|
@@classifiers = {}
|
6
6
|
|
7
7
|
def self.classify(entity, options = {})
|
8
|
-
|
8
|
+
|
9
9
|
set = options[:training]
|
10
10
|
cl = set.classification
|
11
11
|
|
12
12
|
if !@@classifiers[cl]
|
13
13
|
dec_tree = DecisionTree::ID3Tree.new(
|
14
|
-
set.labels, set.items,
|
15
|
-
cl.default,
|
14
|
+
set.labels.map { |l| l.to_s }, set.items,
|
15
|
+
cl.default, cl.mode)
|
16
16
|
dec_tree.train
|
17
17
|
else
|
18
18
|
dec_tree = @@classifiers[cl]
|
19
19
|
end
|
20
20
|
|
21
|
+
cl.export_item(entity, false).inspect
|
22
|
+
|
21
23
|
dec_tree.predict(
|
22
24
|
cl.export_item(entity, false)
|
23
|
-
)
|
25
|
+
)
|
24
26
|
|
25
27
|
end
|
26
28
|
|
data/lib/treat/classification.rb
CHANGED
@@ -4,18 +4,19 @@ class Treat::Classification
|
|
4
4
|
attr_reader :features
|
5
5
|
attr_reader :question
|
6
6
|
attr_reader :labels
|
7
|
+
attr_reader :mode
|
7
8
|
attr_reader :default
|
8
9
|
|
9
|
-
def initialize(type_or_types, feature_or_features,
|
10
|
-
|
10
|
+
def initialize(type_or_types, feature_or_features,
|
11
|
+
question, default = false, mode = :continuous)
|
11
12
|
@types, @features,
|
12
13
|
@question, @default =
|
13
14
|
[*type_or_types],
|
14
15
|
[*feature_or_features],
|
15
16
|
question, default
|
16
17
|
|
18
|
+
@mode = mode
|
17
19
|
@labels = []
|
18
|
-
|
19
20
|
@features.each do |cmd|
|
20
21
|
if cmd.is_a?(Array)
|
21
22
|
@labels << cmd[0]
|
@@ -23,11 +24,9 @@ class Treat::Classification
|
|
23
24
|
@labels << cmd
|
24
25
|
end
|
25
26
|
end
|
26
|
-
|
27
27
|
end
|
28
28
|
|
29
29
|
def export_item(e, include_question = true)
|
30
|
-
|
31
30
|
line = []
|
32
31
|
|
33
32
|
@features.each do |cmd|
|
@@ -46,16 +45,16 @@ class Treat::Classification
|
|
46
45
|
end
|
47
46
|
end
|
48
47
|
|
49
|
-
|
50
|
-
if
|
51
|
-
line << e.
|
48
|
+
if include_question
|
49
|
+
if e.has?(@question)
|
50
|
+
line << e.get(@question)
|
51
|
+
else
|
52
|
+
line << @default
|
52
53
|
end
|
53
|
-
rescue Treat::Exception
|
54
|
-
line << @default
|
55
54
|
end
|
55
|
+
|
56
56
|
line[-1] = '' if line[-1].nil?
|
57
57
|
line
|
58
|
-
|
59
58
|
end
|
60
59
|
|
61
60
|
end
|
data/lib/treat/entities/abilities/buildable.rb
CHANGED
@@ -95,7 +95,7 @@ module Treat::Entities::Abilities::Buildable
|
|
95
95
|
|
96
96
|
f = Treat::Downloader.download(
|
97
97
|
uri.scheme, uri.host, path, file)
|
98
|
-
options[:
|
98
|
+
options[:default_to] ||= :html
|
99
99
|
|
100
100
|
e = from_file(f, options)
|
101
101
|
e.set :url, url
|
@@ -170,9 +170,8 @@ module Treat::Entities::Abilities::Buildable
|
|
170
170
|
"point to a readable file."
|
171
171
|
end
|
172
172
|
|
173
|
-
dflt = options[:_default_format]
|
174
173
|
fmt = Treat::Formatters::Readers::Autoselect.
|
175
|
-
detect_format(file,
|
174
|
+
detect_format(file, options[:default_to])
|
176
175
|
options[:_format] = fmt
|
177
176
|
|
178
177
|
if fmt == :yaml || fmt == :yml ||
|
data/lib/treat/entities/abilities/countable.rb
CHANGED
@@ -3,27 +3,29 @@ module Treat::Entities::Abilities::Countable
|
|
3
3
|
# Find the position of the current entity
|
4
4
|
# inside the parent entity, starting at 1.
|
5
5
|
def position
|
6
|
-
|
7
6
|
unless has_parent?
|
8
7
|
raise Treat::Exception,
|
9
8
|
"No parent to get position in."
|
10
9
|
end
|
11
|
-
|
12
10
|
parent.children.index(self) + 1
|
13
|
-
|
14
11
|
end
|
15
12
|
|
13
|
+
# Find the position of this entity from
|
14
|
+
# the end of the parent entity.
|
15
|
+
def position_from_end
|
16
|
+
p = position
|
17
|
+
parent.size - p
|
18
|
+
end
|
19
|
+
|
16
20
|
# Find the frequency of the entity in
|
17
21
|
# the supplied parent or in the root
|
18
22
|
# node if nil.
|
19
23
|
def frequency_in(parent_type = nil)
|
20
|
-
|
21
24
|
unless parent_type
|
22
25
|
root.registry[:value][id]
|
23
26
|
end
|
24
27
|
|
25
28
|
registry(parent_type)[:value][value]
|
26
|
-
|
27
29
|
end
|
28
30
|
|
29
31
|
# Get the frequency of this entity's
|
data/lib/treat/entities/abilities/iterable.rb
CHANGED
@@ -99,6 +99,15 @@ module Treat::Entities::Abilities::Iterable
|
|
99
99
|
|
100
100
|
alias :ancestors_with_type :ancestors_with_types
|
101
101
|
|
102
|
+
# Number of children that have a given feature.
|
103
|
+
def num_children_with_feature(feature)
|
104
|
+
i = 0
|
105
|
+
each do |c|
|
106
|
+
i += 1 if c.has?(feature)
|
107
|
+
end
|
108
|
+
i
|
109
|
+
end
|
110
|
+
|
102
111
|
# Return the first element in the array, warning if not
|
103
112
|
# the only one in the array. Used for magic methods: e.g.,
|
104
113
|
# the magic method "word" if called on a sentence with many
|
data/lib/treat/entities/abilities/magical.rb
CHANGED
@@ -35,6 +35,9 @@ module Treat::Entities::Abilities::Magical
|
|
35
35
|
entities_with_type($1.intern)
|
36
36
|
elsif method =~ /^#{@@entities_regexp}$/
|
37
37
|
first_but_warn(entities_with_type($1.intern), $1)
|
38
|
+
elsif method =~ /^first_#{@@entities_regexp}$/
|
39
|
+
e = entities_with_type($1.intern)
|
40
|
+
e ? e[0] : nil
|
38
41
|
elsif method =~ /^parent_#{@@entities_regexp}$/
|
39
42
|
ancestor_with_type($1.intern)
|
40
43
|
elsif method =~ /^each_#{@@entities_regexp}$/
|
@@ -59,21 +62,23 @@ module Treat::Entities::Abilities::Magical
|
|
59
62
|
entities_with_category($1.intern)
|
60
63
|
elsif method =~ /^#{@@cats_regexp}$/
|
61
64
|
first_but_warn(entities_with_category($1.intern), $1)
|
65
|
+
elsif method =~ /^first_#{@@cats_regexp}$/
|
66
|
+
e = entities_with_category($1.intern)
|
67
|
+
e ? e[0] : nil
|
62
68
|
elsif method =~ /^#{@@cats_regexp}_count$/
|
63
69
|
entities_with_category($1.intern).size
|
70
|
+
elsif method =~ /^(.*)_count$/
|
71
|
+
num_children_with_feature($1.intern)
|
64
72
|
elsif method =~ /^#{@@cats_regexp}s_with_([a-z]*)$/
|
65
73
|
entities_with_feature($2.intern, args[0], $1)
|
66
74
|
elsif method =~ /^#{@@cats_regexp}_with_([a-z]*)$/
|
67
75
|
first_but_warn(entities_with_feature(
|
68
76
|
$2.intern, args[0], $1.intern), $1)
|
69
|
-
elsif method =~ /^([a-z]*)
|
70
|
-
f = send(
|
77
|
+
elsif method =~ /^([a-z]*)_of_(.*)$/
|
78
|
+
f = send($2.intern)
|
71
79
|
f ? f.send($1.intern) : nil
|
72
80
|
elsif method =~ /^frequency_in_#{@@entities_regexp}$/
|
73
81
|
frequency_in($1.intern)
|
74
|
-
# first_word
|
75
|
-
# tag_of_first_verb
|
76
|
-
# tag_of_title
|
77
82
|
else
|
78
83
|
return :no_magic
|
79
84
|
end
|
data/lib/treat/extractors/keywords/tf_idf.rb
CHANGED
@@ -13,9 +13,9 @@ class Treat::Extractors::Keywords::TfIdf
|
|
13
13
|
|
14
14
|
options = DefaultOptions.merge(options)
|
15
15
|
tf_idfs = {}
|
16
|
+
|
16
17
|
entity.each_word do |word|
|
17
|
-
word.
|
18
|
-
tf_idfs[word] ||= word.get(:tf_idf)
|
18
|
+
tf_idfs[word] ||= word.tf_idf
|
19
19
|
end
|
20
20
|
|
21
21
|
tf_idfs = tf_idfs.
|
@@ -32,29 +32,32 @@ class Treat::Extractors::Keywords::TfIdf
|
|
32
32
|
|
33
33
|
w = word[0].to_s
|
34
34
|
next if keywords.include?(w)
|
35
|
-
|
36
|
-
entity.each_word_with_value(w) do |w2|
|
37
|
-
|
38
|
-
ps = w2.parent_phrase
|
39
|
-
|
40
|
-
if ps.has?(:keyword_count)
|
41
|
-
ps.set :keyword_count,
|
42
|
-
ps.keyword_count + 1
|
43
|
-
else
|
44
|
-
ps.set :keyword_count, 1
|
45
|
-
end
|
46
|
-
ps.set :keyword_density,
|
47
|
-
(ps.keyword_count / ps.size)
|
48
|
-
|
49
|
-
end
|
50
|
-
|
51
35
|
break if i > options[:number]
|
52
36
|
keywords << w
|
53
37
|
|
54
38
|
i += 1
|
55
39
|
end
|
56
40
|
|
41
|
+
entity.each_word do |word|
|
42
|
+
|
43
|
+
if keywords.include?(word.to_s)
|
44
|
+
word.set :is_keyword?, true
|
45
|
+
pp = entity.parent_phrase
|
46
|
+
next unless pp
|
47
|
+
if pp.has? :keyword_count
|
48
|
+
pp.set :keyword_count,
|
49
|
+
pp.keyword_count + 1
|
50
|
+
else
|
51
|
+
pp.set :keyword_count, 1
|
52
|
+
end
|
53
|
+
else
|
54
|
+
word.set :is_keyword?, false
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
58
|
+
|
57
59
|
keywords
|
60
|
+
|
58
61
|
end
|
59
62
|
|
60
63
|
end
|
data/spec/document.rb
CHANGED
@@ -36,11 +36,24 @@ describe Treat::Entities::Document do
|
|
36
36
|
"a document with the contents of the file" do
|
37
37
|
url = 'http://www.rubyinside.com/nethttp-cheat-sheet-2940.html'
|
38
38
|
d = Treat::Entities::Document.build(url)
|
39
|
+
d.format.should eql :html
|
40
|
+
d.print_tree
|
39
41
|
d.should be_an_instance_of Treat::Entities::Document
|
40
42
|
d.to_s.index('Rubyist').should_not eql nil
|
41
43
|
end
|
42
44
|
end
|
43
45
|
|
46
|
+
context "when supplied with a url with no file extension" do
|
47
|
+
it "downloads the file the URL points to and opens " +
|
48
|
+
"a document with the contents of the file, assuming " +
|
49
|
+
"the downloaded file to be in HTML format" do
|
50
|
+
url = 'http://www.economist.com/node/21552208'
|
51
|
+
d = Treat::Entities::Document.build(url)
|
52
|
+
d.should be_an_instance_of Treat::Entities::Document
|
53
|
+
d.to_s.index('Ronnie Lupe').should_not eql nil
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
44
57
|
context "when called with anything else than a " +
|
45
58
|
"readable file name or url" do
|
46
59
|
|
data/spec/sandbox.rb
CHANGED
@@ -1,5 +1,116 @@
|
|
1
|
+
=begin
|
1
2
|
require_relative '../lib/treat'
|
2
3
|
|
3
|
-
|
4
|
-
|
5
|
-
|
4
|
+
def extract(sentences, n)
|
5
|
+
sentences.to_a.values_at(
|
6
|
+
*sentences.values
|
7
|
+
.each_with_index
|
8
|
+
.sort.reverse
|
9
|
+
.map(&:last)
|
10
|
+
.sort.take(n))
|
11
|
+
.map(&:first)
|
12
|
+
end
|
13
|
+
|
14
|
+
describe "#summarize" do
|
15
|
+
|
16
|
+
it "provides a summary of the text" do
|
17
|
+
|
18
|
+
Treat.debug = true
|
19
|
+
Treat.silence = true
|
20
|
+
|
21
|
+
|
22
|
+
context = Treat::DataSet.open('economist-context.yml')
|
23
|
+
content = Treat::DataSet.open('economist-content.yml')
|
24
|
+
|
25
|
+
c = Collection (Treat.spec + 'economist')
|
26
|
+
c.do :chunk, :segment, :tokenize, :tag, :name_tag, :tf_idf, :keywords
|
27
|
+
|
28
|
+
c.each_document do |d|
|
29
|
+
|
30
|
+
sentences = {}
|
31
|
+
|
32
|
+
d.each_sentence do |sentence|
|
33
|
+
cx = sentence.classify(:training => context)
|
34
|
+
ct = sentence.classify(:training => content)
|
35
|
+
sentences[sentence] = cx[1] + ct[1]
|
36
|
+
end
|
37
|
+
|
38
|
+
puts
|
39
|
+
puts d.titles[0].to_s
|
40
|
+
puts
|
41
|
+
|
42
|
+
puts extract(sentences, 5).map { |p| p.to_s }.join(' ')
|
43
|
+
|
44
|
+
end
|
45
|
+
|
46
|
+
c.serialize file: 'economist-coll.yaml'
|
47
|
+
|
48
|
+
c = Collection (Treat.spec + 'economist')
|
49
|
+
c.do :chunk, :segment, :tokenize, :tag, :name_tag, :tf_idf, :keywords
|
50
|
+
|
51
|
+
# Topic word count ? Synonyms of keywords ?
|
52
|
+
# Time expressions?
|
53
|
+
classify_content = Treat::Classification.new(
|
54
|
+
:phrase,
|
55
|
+
[:word_count, :number_count,
|
56
|
+
:keyword_count, :name_tag_count],
|
57
|
+
:has_key_content?
|
58
|
+
)
|
59
|
+
|
60
|
+
classify_context = Treat::Classification.new(
|
61
|
+
:phrase,
|
62
|
+
[:position,
|
63
|
+
:position_from_end,
|
64
|
+
:type_of_parent_zone,
|
65
|
+
:value_of_first_word,
|
66
|
+
:tag_of_first_word
|
67
|
+
],
|
68
|
+
:has_key_context?,
|
69
|
+
false,
|
70
|
+
:discrete
|
71
|
+
)
|
72
|
+
|
73
|
+
c.each_sentence do |s|
|
74
|
+
puts s.to_s
|
75
|
+
a = STDIN.gets.to_s.strip
|
76
|
+
if a == ''
|
77
|
+
s.set :has_key_content?, false
|
78
|
+
s.set :has_key_context?, false
|
79
|
+
else
|
80
|
+
s.set :has_key_content?, true
|
81
|
+
s.set :has_key_context?, true
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
context = c.export(classify_context)
|
86
|
+
content = c.export(classify_content)
|
87
|
+
|
88
|
+
context.save('economist-context.yml')
|
89
|
+
content.save('economist-content.yml')
|
90
|
+
|
91
|
+
context = Treat::DataSet.open('economist-context.yml')
|
92
|
+
content = Treat::DataSet.open('economist-content.yml')
|
93
|
+
|
94
|
+
c.each_document do |d|
|
95
|
+
|
96
|
+
sentences = {}
|
97
|
+
|
98
|
+
d.each_sentence do |sentence|
|
99
|
+
cx = sentence.classify(:training => context)
|
100
|
+
ct = sentence.classify(:training => content)
|
101
|
+
sentences[sentence] = cx[1] + ct[1]
|
102
|
+
end
|
103
|
+
|
104
|
+
puts
|
105
|
+
puts d.titles[0].to_s
|
106
|
+
puts
|
107
|
+
|
108
|
+
puts extract(sentences, 5).map { |p| p.to_s }.join(' ')
|
109
|
+
|
110
|
+
end
|
111
|
+
|
112
|
+
|
113
|
+
end
|
114
|
+
|
115
|
+
end
|
116
|
+
=end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: treat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.0.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rubyzip
|