treat 1.0.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -19,7 +19,7 @@ Treat is a toolkit for natural language processing and computational linguistics
19
19
 
20
20
  **Resources**
21
21
 
22
- * Read the [latest documentation](http://rubydoc.info/github/louismullie/treat/master/frames).
22
+ * Read the [latest documentation](http://rubydoc.info/github/louismullie/treat/frames).
23
23
  * See how to [install Treat](https://github.com/louismullie/treat/wiki/Installing-Treat).
24
24
  * Learn how to [use Treat](https://github.com/louismullie/treat/wiki/Using-Treat).
25
25
  * Help out by [contributing to the project](https://github.com/louismullie/treat/wiki/Contributing-to-Treat).
@@ -10,7 +10,7 @@ module Treat
10
10
  end
11
11
 
12
12
  # The current version of Treat.
13
- VERSION = "1.0.0"
13
+ VERSION = "1.0.1"
14
14
 
15
15
  # Add methods to handle syntactic sugar,
16
16
  # language configuration options, and paths.
@@ -5,22 +5,24 @@ class Treat::AI::Classifiers::ID3
5
5
  @@classifiers = {}
6
6
 
7
7
  def self.classify(entity, options = {})
8
-
8
+
9
9
  set = options[:training]
10
10
  cl = set.classification
11
11
 
12
12
  if !@@classifiers[cl]
13
13
  dec_tree = DecisionTree::ID3Tree.new(
14
- set.labels, set.items,
15
- cl.default, :continuous)
14
+ set.labels.map { |l| l.to_s }, set.items,
15
+ cl.default, cl.mode)
16
16
  dec_tree.train
17
17
  else
18
18
  dec_tree = @@classifiers[cl]
19
19
  end
20
20
 
21
+ cl.export_item(entity, false).inspect
22
+
21
23
  dec_tree.predict(
22
24
  cl.export_item(entity, false)
23
- )[0]
25
+ )
24
26
 
25
27
  end
26
28
 
@@ -4,18 +4,19 @@ class Treat::Classification
4
4
  attr_reader :features
5
5
  attr_reader :question
6
6
  attr_reader :labels
7
+ attr_reader :mode
7
8
  attr_reader :default
8
9
 
9
- def initialize(type_or_types, feature_or_features, question, default = false)
10
-
10
+ def initialize(type_or_types, feature_or_features,
11
+ question, default = false, mode = :continuous)
11
12
  @types, @features,
12
13
  @question, @default =
13
14
  [*type_or_types],
14
15
  [*feature_or_features],
15
16
  question, default
16
17
 
18
+ @mode = mode
17
19
  @labels = []
18
-
19
20
  @features.each do |cmd|
20
21
  if cmd.is_a?(Array)
21
22
  @labels << cmd[0]
@@ -23,11 +24,9 @@ class Treat::Classification
23
24
  @labels << cmd
24
25
  end
25
26
  end
26
-
27
27
  end
28
28
 
29
29
  def export_item(e, include_question = true)
30
-
31
30
  line = []
32
31
 
33
32
  @features.each do |cmd|
@@ -46,16 +45,16 @@ class Treat::Classification
46
45
  end
47
46
  end
48
47
 
49
- begin
50
- if include_question
51
- line << e.send(@question)
48
+ if include_question
49
+ if e.has?(@question)
50
+ line << e.get(@question)
51
+ else
52
+ line << @default
52
53
  end
53
- rescue Treat::Exception
54
- line << @default
55
54
  end
55
+
56
56
  line[-1] = '' if line[-1].nil?
57
57
  line
58
-
59
58
  end
60
59
 
61
60
  end
@@ -95,7 +95,7 @@ module Treat::Entities::Abilities::Buildable
95
95
 
96
96
  f = Treat::Downloader.download(
97
97
  uri.scheme, uri.host, path, file)
98
- options[:_default_format] = :html
98
+ options[:default_to] ||= :html
99
99
 
100
100
  e = from_file(f, options)
101
101
  e.set :url, url
@@ -170,9 +170,8 @@ module Treat::Entities::Abilities::Buildable
170
170
  "point to a readable file."
171
171
  end
172
172
 
173
- dflt = options[:_default_format]
174
173
  fmt = Treat::Formatters::Readers::Autoselect.
175
- detect_format(file, dflt)
174
+ detect_format(file, options[:default_to])
176
175
  options[:_format] = fmt
177
176
 
178
177
  if fmt == :yaml || fmt == :yml ||
@@ -3,27 +3,29 @@ module Treat::Entities::Abilities::Countable
3
3
  # Find the position of the current entity
4
4
  # inside the parent entity, starting at 1.
5
5
  def position
6
-
7
6
  unless has_parent?
8
7
  raise Treat::Exception,
9
8
  "No parent to get position in."
10
9
  end
11
-
12
10
  parent.children.index(self) + 1
13
-
14
11
  end
15
12
 
13
+ # Find the position of this entity from
14
+ # the end of the parent entity.
15
+ def position_from_end
16
+ p = position
17
+ parent.size - p
18
+ end
19
+
16
20
  # Find the frequency of the entity in
17
21
  # the supplied parent or in the root
18
22
  # node if nil.
19
23
  def frequency_in(parent_type = nil)
20
-
21
24
  unless parent_type
22
25
  root.registry[:value][id]
23
26
  end
24
27
 
25
28
  registry(parent_type)[:value][value]
26
-
27
29
  end
28
30
 
29
31
  # Get the frequency of this entity's
@@ -99,6 +99,15 @@ module Treat::Entities::Abilities::Iterable
99
99
 
100
100
  alias :ancestors_with_type :ancestors_with_types
101
101
 
102
+ # Number of children that have a given feature.
103
+ def num_children_with_feature(feature)
104
+ i = 0
105
+ each do |c|
106
+ i += 1 if c.has?(feature)
107
+ end
108
+ i
109
+ end
110
+
102
111
  # Return the first element in the array, warning if not
103
112
  # the only one in the array. Used for magic methods: e.g.,
104
113
  # the magic method "word" if called on a sentence with many
@@ -35,6 +35,9 @@ module Treat::Entities::Abilities::Magical
35
35
  entities_with_type($1.intern)
36
36
  elsif method =~ /^#{@@entities_regexp}$/
37
37
  first_but_warn(entities_with_type($1.intern), $1)
38
+ elsif method =~ /^first_#{@@entities_regexp}$/
39
+ e = entities_with_type($1.intern)
40
+ e ? e[0] : nil
38
41
  elsif method =~ /^parent_#{@@entities_regexp}$/
39
42
  ancestor_with_type($1.intern)
40
43
  elsif method =~ /^each_#{@@entities_regexp}$/
@@ -59,21 +62,23 @@ module Treat::Entities::Abilities::Magical
59
62
  entities_with_category($1.intern)
60
63
  elsif method =~ /^#{@@cats_regexp}$/
61
64
  first_but_warn(entities_with_category($1.intern), $1)
65
+ elsif method =~ /^first_#{@@cats_regexp}$/
66
+ e = entities_with_category($1.intern)
67
+ e ? e[0] : nil
62
68
  elsif method =~ /^#{@@cats_regexp}_count$/
63
69
  entities_with_category($1.intern).size
70
+ elsif method =~ /^(.*)_count$/
71
+ num_children_with_feature($1.intern)
64
72
  elsif method =~ /^#{@@cats_regexp}s_with_([a-z]*)$/
65
73
  entities_with_feature($2.intern, args[0], $1)
66
74
  elsif method =~ /^#{@@cats_regexp}_with_([a-z]*)$/
67
75
  first_but_warn(entities_with_feature(
68
76
  $2.intern, args[0], $1.intern), $1)
69
- elsif method =~ /^([a-z]*)_of_first_#{@@entities_regexp}$/
70
- f = send(:"#{$2}s".intern).first
77
+ elsif method =~ /^([a-z]*)_of_(.*)$/
78
+ f = send($2.intern)
71
79
  f ? f.send($1.intern) : nil
72
80
  elsif method =~ /^frequency_in_#{@@entities_regexp}$/
73
81
  frequency_in($1.intern)
74
- # first_word
75
- # tag_of_first_verb
76
- # tag_of_title
77
82
  else
78
83
  return :no_magic
79
84
  end
@@ -13,9 +13,9 @@ class Treat::Extractors::Keywords::TfIdf
13
13
 
14
14
  options = DefaultOptions.merge(options)
15
15
  tf_idfs = {}
16
+
16
17
  entity.each_word do |word|
17
- word.check_has(:tf_idf, false)
18
- tf_idfs[word] ||= word.get(:tf_idf)
18
+ tf_idfs[word] ||= word.tf_idf
19
19
  end
20
20
 
21
21
  tf_idfs = tf_idfs.
@@ -32,29 +32,32 @@ class Treat::Extractors::Keywords::TfIdf
32
32
 
33
33
  w = word[0].to_s
34
34
  next if keywords.include?(w)
35
-
36
- entity.each_word_with_value(w) do |w2|
37
-
38
- ps = w2.parent_phrase
39
-
40
- if ps.has?(:keyword_count)
41
- ps.set :keyword_count,
42
- ps.keyword_count + 1
43
- else
44
- ps.set :keyword_count, 1
45
- end
46
- ps.set :keyword_density,
47
- (ps.keyword_count / ps.size)
48
-
49
- end
50
-
51
35
  break if i > options[:number]
52
36
  keywords << w
53
37
 
54
38
  i += 1
55
39
  end
56
40
 
41
+ entity.each_word do |word|
42
+
43
+ if keywords.include?(word.to_s)
44
+ word.set :is_keyword?, true
45
+ pp = entity.parent_phrase
46
+ next unless pp
47
+ if pp.has? :keyword_count
48
+ pp.set :keyword_count,
49
+ pp.keyword_count + 1
50
+ else
51
+ pp.set :keyword_count, 1
52
+ end
53
+ else
54
+ word.set :is_keyword?, false
55
+ end
56
+
57
+ end
58
+
57
59
  keywords
60
+
58
61
  end
59
62
 
60
63
  end
@@ -19,7 +19,7 @@ class Treat::Loaders
19
19
  StanfordCoreNLP.log_file =
20
20
  NULL_DEVICE if Treat.silence
21
21
 
22
- StanfordCoreNLP.init
22
+ StanfordCoreNLP.bind
23
23
  @@loaded = true
24
24
 
25
25
  end
@@ -36,11 +36,24 @@ describe Treat::Entities::Document do
36
36
  "a document with the contents of the file" do
37
37
  url = 'http://www.rubyinside.com/nethttp-cheat-sheet-2940.html'
38
38
  d = Treat::Entities::Document.build(url)
39
+ d.format.should eql :html
40
+ d.print_tree
39
41
  d.should be_an_instance_of Treat::Entities::Document
40
42
  d.to_s.index('Rubyist').should_not eql nil
41
43
  end
42
44
  end
43
45
 
46
+ context "when supplied with a url with no file extension" do
47
+ it "downloads the file the URL points to and opens " +
48
+ "a document with the contents of the file, assuming " +
49
+ "the downloaded file to be in HTML format" do
50
+ url = 'http://www.economist.com/node/21552208'
51
+ d = Treat::Entities::Document.build(url)
52
+ d.should be_an_instance_of Treat::Entities::Document
53
+ d.to_s.index('Ronnie Lupe').should_not eql nil
54
+ end
55
+ end
56
+
44
57
  context "when called with anything else than a " +
45
58
  "readable file name or url" do
46
59
 
@@ -1,5 +1,116 @@
1
+ =begin
1
2
  require_relative '../lib/treat'
2
3
 
3
- c = Collection (Treat.spec + 'samples/mathematicians')
4
- c.do :chunk, :segment, :tokenize, :tf_idf, :keywords
5
- c.visualize :dot, :file => 'test2.dot', :remove_types => [:paragraph]
4
+ def extract(sentences, n)
5
+ sentences.to_a.values_at(
6
+ *sentences.values
7
+ .each_with_index
8
+ .sort.reverse
9
+ .map(&:last)
10
+ .sort.take(n))
11
+ .map(&:first)
12
+ end
13
+
14
+ describe "#summarize" do
15
+
16
+ it "provides a summary of the text" do
17
+
18
+ Treat.debug = true
19
+ Treat.silence = true
20
+
21
+
22
+ context = Treat::DataSet.open('economist-context.yml')
23
+ content = Treat::DataSet.open('economist-content.yml')
24
+
25
+ c = Collection (Treat.spec + 'economist')
26
+ c.do :chunk, :segment, :tokenize, :tag, :name_tag, :tf_idf, :keywords
27
+
28
+ c.each_document do |d|
29
+
30
+ sentences = {}
31
+
32
+ d.each_sentence do |sentence|
33
+ cx = sentence.classify(:training => context)
34
+ ct = sentence.classify(:training => content)
35
+ sentences[sentence] = cx[1] + ct[1]
36
+ end
37
+
38
+ puts
39
+ puts d.titles[0].to_s
40
+ puts
41
+
42
+ puts extract(sentences, 5).map { |p| p.to_s }.join(' ')
43
+
44
+ end
45
+
46
+ c.serialize file: 'economist-coll.yaml'
47
+
48
+ c = Collection (Treat.spec + 'economist')
49
+ c.do :chunk, :segment, :tokenize, :tag, :name_tag, :tf_idf, :keywords
50
+
51
+ # Topic word count ? Synonyms of keywords ?
52
+ # Time expressions?
53
+ classify_content = Treat::Classification.new(
54
+ :phrase,
55
+ [:word_count, :number_count,
56
+ :keyword_count, :name_tag_count],
57
+ :has_key_content?
58
+ )
59
+
60
+ classify_context = Treat::Classification.new(
61
+ :phrase,
62
+ [:position,
63
+ :position_from_end,
64
+ :type_of_parent_zone,
65
+ :value_of_first_word,
66
+ :tag_of_first_word
67
+ ],
68
+ :has_key_context?,
69
+ false,
70
+ :discrete
71
+ )
72
+
73
+ c.each_sentence do |s|
74
+ puts s.to_s
75
+ a = STDIN.gets.to_s.strip
76
+ if a == ''
77
+ s.set :has_key_content?, false
78
+ s.set :has_key_context?, false
79
+ else
80
+ s.set :has_key_content?, true
81
+ s.set :has_key_context?, true
82
+ end
83
+ end
84
+
85
+ context = c.export(classify_context)
86
+ content = c.export(classify_content)
87
+
88
+ context.save('economist-context.yml')
89
+ content.save('economist-content.yml')
90
+
91
+ context = Treat::DataSet.open('economist-context.yml')
92
+ content = Treat::DataSet.open('economist-content.yml')
93
+
94
+ c.each_document do |d|
95
+
96
+ sentences = {}
97
+
98
+ d.each_sentence do |sentence|
99
+ cx = sentence.classify(:training => context)
100
+ ct = sentence.classify(:training => content)
101
+ sentences[sentence] = cx[1] + ct[1]
102
+ end
103
+
104
+ puts
105
+ puts d.titles[0].to_s
106
+ puts
107
+
108
+ puts extract(sentences, 5).map { |p| p.to_s }.join(' ')
109
+
110
+ end
111
+
112
+
113
+ end
114
+
115
+ end
116
+ =end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: treat
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-05 00:00:00.000000000 Z
12
+ date: 2012-04-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rubyzip