treat 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +1 -1
- data/lib/treat.rb +1 -1
- data/lib/treat/ai/classifiers/id3.rb +6 -4
- data/lib/treat/classification.rb +10 -11
- data/lib/treat/entities/abilities/buildable.rb +2 -3
- data/lib/treat/entities/abilities/countable.rb +7 -5
- data/lib/treat/entities/abilities/iterable.rb +9 -0
- data/lib/treat/entities/abilities/magical.rb +10 -5
- data/lib/treat/extractors/keywords/tf_idf.rb +21 -18
- data/lib/treat/loaders/stanford.rb +1 -1
- data/spec/document.rb +13 -0
- data/spec/sandbox.rb +114 -3
- metadata +2 -2
data/README.md
CHANGED
@@ -19,7 +19,7 @@ Treat is a toolkit for natural language processing and computational linguistics
|
|
19
19
|
|
20
20
|
**Resources**
|
21
21
|
|
22
|
-
* Read the [latest documentation](http://rubydoc.info/github/louismullie/treat/
|
22
|
+
* Read the [latest documentation](http://rubydoc.info/github/louismullie/treat/frames).
|
23
23
|
* See how to [install Treat](https://github.com/louismullie/treat/wiki/Installing-Treat).
|
24
24
|
* Learn how to [use Treat](https://github.com/louismullie/treat/wiki/Using-Treat).
|
25
25
|
* Help out by [contributing to the project](https://github.com/louismullie/treat/wiki/Contributing-to-Treat).
|
data/lib/treat.rb
CHANGED
@@ -5,22 +5,24 @@ class Treat::AI::Classifiers::ID3
|
|
5
5
|
@@classifiers = {}
|
6
6
|
|
7
7
|
def self.classify(entity, options = {})
|
8
|
-
|
8
|
+
|
9
9
|
set = options[:training]
|
10
10
|
cl = set.classification
|
11
11
|
|
12
12
|
if !@@classifiers[cl]
|
13
13
|
dec_tree = DecisionTree::ID3Tree.new(
|
14
|
-
set.labels, set.items,
|
15
|
-
cl.default,
|
14
|
+
set.labels.map { |l| l.to_s }, set.items,
|
15
|
+
cl.default, cl.mode)
|
16
16
|
dec_tree.train
|
17
17
|
else
|
18
18
|
dec_tree = @@classifiers[cl]
|
19
19
|
end
|
20
20
|
|
21
|
+
cl.export_item(entity, false).inspect
|
22
|
+
|
21
23
|
dec_tree.predict(
|
22
24
|
cl.export_item(entity, false)
|
23
|
-
)
|
25
|
+
)
|
24
26
|
|
25
27
|
end
|
26
28
|
|
data/lib/treat/classification.rb
CHANGED
@@ -4,18 +4,19 @@ class Treat::Classification
|
|
4
4
|
attr_reader :features
|
5
5
|
attr_reader :question
|
6
6
|
attr_reader :labels
|
7
|
+
attr_reader :mode
|
7
8
|
attr_reader :default
|
8
9
|
|
9
|
-
def initialize(type_or_types, feature_or_features,
|
10
|
-
|
10
|
+
def initialize(type_or_types, feature_or_features,
|
11
|
+
question, default = false, mode = :continuous)
|
11
12
|
@types, @features,
|
12
13
|
@question, @default =
|
13
14
|
[*type_or_types],
|
14
15
|
[*feature_or_features],
|
15
16
|
question, default
|
16
17
|
|
18
|
+
@mode = mode
|
17
19
|
@labels = []
|
18
|
-
|
19
20
|
@features.each do |cmd|
|
20
21
|
if cmd.is_a?(Array)
|
21
22
|
@labels << cmd[0]
|
@@ -23,11 +24,9 @@ class Treat::Classification
|
|
23
24
|
@labels << cmd
|
24
25
|
end
|
25
26
|
end
|
26
|
-
|
27
27
|
end
|
28
28
|
|
29
29
|
def export_item(e, include_question = true)
|
30
|
-
|
31
30
|
line = []
|
32
31
|
|
33
32
|
@features.each do |cmd|
|
@@ -46,16 +45,16 @@ class Treat::Classification
|
|
46
45
|
end
|
47
46
|
end
|
48
47
|
|
49
|
-
|
50
|
-
if
|
51
|
-
line << e.
|
48
|
+
if include_question
|
49
|
+
if e.has?(@question)
|
50
|
+
line << e.get(@question)
|
51
|
+
else
|
52
|
+
line << @default
|
52
53
|
end
|
53
|
-
rescue Treat::Exception
|
54
|
-
line << @default
|
55
54
|
end
|
55
|
+
|
56
56
|
line[-1] = '' if line[-1].nil?
|
57
57
|
line
|
58
|
-
|
59
58
|
end
|
60
59
|
|
61
60
|
end
|
@@ -95,7 +95,7 @@ module Treat::Entities::Abilities::Buildable
|
|
95
95
|
|
96
96
|
f = Treat::Downloader.download(
|
97
97
|
uri.scheme, uri.host, path, file)
|
98
|
-
options[:
|
98
|
+
options[:default_to] ||= :html
|
99
99
|
|
100
100
|
e = from_file(f, options)
|
101
101
|
e.set :url, url
|
@@ -170,9 +170,8 @@ module Treat::Entities::Abilities::Buildable
|
|
170
170
|
"point to a readable file."
|
171
171
|
end
|
172
172
|
|
173
|
-
dflt = options[:_default_format]
|
174
173
|
fmt = Treat::Formatters::Readers::Autoselect.
|
175
|
-
detect_format(file,
|
174
|
+
detect_format(file, options[:default_to])
|
176
175
|
options[:_format] = fmt
|
177
176
|
|
178
177
|
if fmt == :yaml || fmt == :yml ||
|
@@ -3,27 +3,29 @@ module Treat::Entities::Abilities::Countable
|
|
3
3
|
# Find the position of the current entity
|
4
4
|
# inside the parent entity, starting at 1.
|
5
5
|
def position
|
6
|
-
|
7
6
|
unless has_parent?
|
8
7
|
raise Treat::Exception,
|
9
8
|
"No parent to get position in."
|
10
9
|
end
|
11
|
-
|
12
10
|
parent.children.index(self) + 1
|
13
|
-
|
14
11
|
end
|
15
12
|
|
13
|
+
# Find the position of this entity from
|
14
|
+
# the end of the parent entity.
|
15
|
+
def position_from_end
|
16
|
+
p = position
|
17
|
+
parent.size - p
|
18
|
+
end
|
19
|
+
|
16
20
|
# Find the frequency of the entity in
|
17
21
|
# the supplied parent or in the root
|
18
22
|
# node if nil.
|
19
23
|
def frequency_in(parent_type = nil)
|
20
|
-
|
21
24
|
unless parent_type
|
22
25
|
root.registry[:value][id]
|
23
26
|
end
|
24
27
|
|
25
28
|
registry(parent_type)[:value][value]
|
26
|
-
|
27
29
|
end
|
28
30
|
|
29
31
|
# Get the frequency of this entity's
|
@@ -99,6 +99,15 @@ module Treat::Entities::Abilities::Iterable
|
|
99
99
|
|
100
100
|
alias :ancestors_with_type :ancestors_with_types
|
101
101
|
|
102
|
+
# Number of children that have a given feature.
|
103
|
+
def num_children_with_feature(feature)
|
104
|
+
i = 0
|
105
|
+
each do |c|
|
106
|
+
i += 1 if c.has?(feature)
|
107
|
+
end
|
108
|
+
i
|
109
|
+
end
|
110
|
+
|
102
111
|
# Return the first element in the array, warning if not
|
103
112
|
# the only one in the array. Used for magic methods: e.g.,
|
104
113
|
# the magic method "word" if called on a sentence with many
|
@@ -35,6 +35,9 @@ module Treat::Entities::Abilities::Magical
|
|
35
35
|
entities_with_type($1.intern)
|
36
36
|
elsif method =~ /^#{@@entities_regexp}$/
|
37
37
|
first_but_warn(entities_with_type($1.intern), $1)
|
38
|
+
elsif method =~ /^first_#{@@entities_regexp}$/
|
39
|
+
e = entities_with_type($1.intern)
|
40
|
+
e ? e[0] : nil
|
38
41
|
elsif method =~ /^parent_#{@@entities_regexp}$/
|
39
42
|
ancestor_with_type($1.intern)
|
40
43
|
elsif method =~ /^each_#{@@entities_regexp}$/
|
@@ -59,21 +62,23 @@ module Treat::Entities::Abilities::Magical
|
|
59
62
|
entities_with_category($1.intern)
|
60
63
|
elsif method =~ /^#{@@cats_regexp}$/
|
61
64
|
first_but_warn(entities_with_category($1.intern), $1)
|
65
|
+
elsif method =~ /^first_#{@@cats_regexp}$/
|
66
|
+
e = entities_with_category($1.intern)
|
67
|
+
e ? e[0] : nil
|
62
68
|
elsif method =~ /^#{@@cats_regexp}_count$/
|
63
69
|
entities_with_category($1.intern).size
|
70
|
+
elsif method =~ /^(.*)_count$/
|
71
|
+
num_children_with_feature($1.intern)
|
64
72
|
elsif method =~ /^#{@@cats_regexp}s_with_([a-z]*)$/
|
65
73
|
entities_with_feature($2.intern, args[0], $1)
|
66
74
|
elsif method =~ /^#{@@cats_regexp}_with_([a-z]*)$/
|
67
75
|
first_but_warn(entities_with_feature(
|
68
76
|
$2.intern, args[0], $1.intern), $1)
|
69
|
-
elsif method =~ /^([a-z]*)
|
70
|
-
f = send(
|
77
|
+
elsif method =~ /^([a-z]*)_of_(.*)$/
|
78
|
+
f = send($2.intern)
|
71
79
|
f ? f.send($1.intern) : nil
|
72
80
|
elsif method =~ /^frequency_in_#{@@entities_regexp}$/
|
73
81
|
frequency_in($1.intern)
|
74
|
-
# first_word
|
75
|
-
# tag_of_first_verb
|
76
|
-
# tag_of_title
|
77
82
|
else
|
78
83
|
return :no_magic
|
79
84
|
end
|
@@ -13,9 +13,9 @@ class Treat::Extractors::Keywords::TfIdf
|
|
13
13
|
|
14
14
|
options = DefaultOptions.merge(options)
|
15
15
|
tf_idfs = {}
|
16
|
+
|
16
17
|
entity.each_word do |word|
|
17
|
-
word.
|
18
|
-
tf_idfs[word] ||= word.get(:tf_idf)
|
18
|
+
tf_idfs[word] ||= word.tf_idf
|
19
19
|
end
|
20
20
|
|
21
21
|
tf_idfs = tf_idfs.
|
@@ -32,29 +32,32 @@ class Treat::Extractors::Keywords::TfIdf
|
|
32
32
|
|
33
33
|
w = word[0].to_s
|
34
34
|
next if keywords.include?(w)
|
35
|
-
|
36
|
-
entity.each_word_with_value(w) do |w2|
|
37
|
-
|
38
|
-
ps = w2.parent_phrase
|
39
|
-
|
40
|
-
if ps.has?(:keyword_count)
|
41
|
-
ps.set :keyword_count,
|
42
|
-
ps.keyword_count + 1
|
43
|
-
else
|
44
|
-
ps.set :keyword_count, 1
|
45
|
-
end
|
46
|
-
ps.set :keyword_density,
|
47
|
-
(ps.keyword_count / ps.size)
|
48
|
-
|
49
|
-
end
|
50
|
-
|
51
35
|
break if i > options[:number]
|
52
36
|
keywords << w
|
53
37
|
|
54
38
|
i += 1
|
55
39
|
end
|
56
40
|
|
41
|
+
entity.each_word do |word|
|
42
|
+
|
43
|
+
if keywords.include?(word.to_s)
|
44
|
+
word.set :is_keyword?, true
|
45
|
+
pp = entity.parent_phrase
|
46
|
+
next unless pp
|
47
|
+
if pp.has? :keyword_count
|
48
|
+
pp.set :keyword_count,
|
49
|
+
pp.keyword_count + 1
|
50
|
+
else
|
51
|
+
pp.set :keyword_count, 1
|
52
|
+
end
|
53
|
+
else
|
54
|
+
word.set :is_keyword?, false
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
58
|
+
|
57
59
|
keywords
|
60
|
+
|
58
61
|
end
|
59
62
|
|
60
63
|
end
|
data/spec/document.rb
CHANGED
@@ -36,11 +36,24 @@ describe Treat::Entities::Document do
|
|
36
36
|
"a document with the contents of the file" do
|
37
37
|
url = 'http://www.rubyinside.com/nethttp-cheat-sheet-2940.html'
|
38
38
|
d = Treat::Entities::Document.build(url)
|
39
|
+
d.format.should eql :html
|
40
|
+
d.print_tree
|
39
41
|
d.should be_an_instance_of Treat::Entities::Document
|
40
42
|
d.to_s.index('Rubyist').should_not eql nil
|
41
43
|
end
|
42
44
|
end
|
43
45
|
|
46
|
+
context "when supplied with a url with no file extension" do
|
47
|
+
it "downloads the file the URL points to and opens " +
|
48
|
+
"a document with the contents of the file, assuming " +
|
49
|
+
"the downloaded file to be in HTML format" do
|
50
|
+
url = 'http://www.economist.com/node/21552208'
|
51
|
+
d = Treat::Entities::Document.build(url)
|
52
|
+
d.should be_an_instance_of Treat::Entities::Document
|
53
|
+
d.to_s.index('Ronnie Lupe').should_not eql nil
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
44
57
|
context "when called with anything else than a " +
|
45
58
|
"readable file name or url" do
|
46
59
|
|
data/spec/sandbox.rb
CHANGED
@@ -1,5 +1,116 @@
|
|
1
|
+
=begin
|
1
2
|
require_relative '../lib/treat'
|
2
3
|
|
3
|
-
|
4
|
-
|
5
|
-
|
4
|
+
def extract(sentences, n)
|
5
|
+
sentences.to_a.values_at(
|
6
|
+
*sentences.values
|
7
|
+
.each_with_index
|
8
|
+
.sort.reverse
|
9
|
+
.map(&:last)
|
10
|
+
.sort.take(n))
|
11
|
+
.map(&:first)
|
12
|
+
end
|
13
|
+
|
14
|
+
describe "#summarize" do
|
15
|
+
|
16
|
+
it "provides a summary of the text" do
|
17
|
+
|
18
|
+
Treat.debug = true
|
19
|
+
Treat.silence = true
|
20
|
+
|
21
|
+
|
22
|
+
context = Treat::DataSet.open('economist-context.yml')
|
23
|
+
content = Treat::DataSet.open('economist-content.yml')
|
24
|
+
|
25
|
+
c = Collection (Treat.spec + 'economist')
|
26
|
+
c.do :chunk, :segment, :tokenize, :tag, :name_tag, :tf_idf, :keywords
|
27
|
+
|
28
|
+
c.each_document do |d|
|
29
|
+
|
30
|
+
sentences = {}
|
31
|
+
|
32
|
+
d.each_sentence do |sentence|
|
33
|
+
cx = sentence.classify(:training => context)
|
34
|
+
ct = sentence.classify(:training => content)
|
35
|
+
sentences[sentence] = cx[1] + ct[1]
|
36
|
+
end
|
37
|
+
|
38
|
+
puts
|
39
|
+
puts d.titles[0].to_s
|
40
|
+
puts
|
41
|
+
|
42
|
+
puts extract(sentences, 5).map { |p| p.to_s }.join(' ')
|
43
|
+
|
44
|
+
end
|
45
|
+
|
46
|
+
c.serialize file: 'economist-coll.yaml'
|
47
|
+
|
48
|
+
c = Collection (Treat.spec + 'economist')
|
49
|
+
c.do :chunk, :segment, :tokenize, :tag, :name_tag, :tf_idf, :keywords
|
50
|
+
|
51
|
+
# Topic word count ? Synonyms of keywords ?
|
52
|
+
# Time expressions?
|
53
|
+
classify_content = Treat::Classification.new(
|
54
|
+
:phrase,
|
55
|
+
[:word_count, :number_count,
|
56
|
+
:keyword_count, :name_tag_count],
|
57
|
+
:has_key_content?
|
58
|
+
)
|
59
|
+
|
60
|
+
classify_context = Treat::Classification.new(
|
61
|
+
:phrase,
|
62
|
+
[:position,
|
63
|
+
:position_from_end,
|
64
|
+
:type_of_parent_zone,
|
65
|
+
:value_of_first_word,
|
66
|
+
:tag_of_first_word
|
67
|
+
],
|
68
|
+
:has_key_context?,
|
69
|
+
false,
|
70
|
+
:discrete
|
71
|
+
)
|
72
|
+
|
73
|
+
c.each_sentence do |s|
|
74
|
+
puts s.to_s
|
75
|
+
a = STDIN.gets.to_s.strip
|
76
|
+
if a == ''
|
77
|
+
s.set :has_key_content?, false
|
78
|
+
s.set :has_key_context?, false
|
79
|
+
else
|
80
|
+
s.set :has_key_content?, true
|
81
|
+
s.set :has_key_context?, true
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
context = c.export(classify_context)
|
86
|
+
content = c.export(classify_content)
|
87
|
+
|
88
|
+
context.save('economist-context.yml')
|
89
|
+
content.save('economist-content.yml')
|
90
|
+
|
91
|
+
context = Treat::DataSet.open('economist-context.yml')
|
92
|
+
content = Treat::DataSet.open('economist-content.yml')
|
93
|
+
|
94
|
+
c.each_document do |d|
|
95
|
+
|
96
|
+
sentences = {}
|
97
|
+
|
98
|
+
d.each_sentence do |sentence|
|
99
|
+
cx = sentence.classify(:training => context)
|
100
|
+
ct = sentence.classify(:training => content)
|
101
|
+
sentences[sentence] = cx[1] + ct[1]
|
102
|
+
end
|
103
|
+
|
104
|
+
puts
|
105
|
+
puts d.titles[0].to_s
|
106
|
+
puts
|
107
|
+
|
108
|
+
puts extract(sentences, 5).map { |p| p.to_s }.join(' ')
|
109
|
+
|
110
|
+
end
|
111
|
+
|
112
|
+
|
113
|
+
end
|
114
|
+
|
115
|
+
end
|
116
|
+
=end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: treat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rubyzip
|