treat 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -19,7 +19,7 @@ Treat is a toolkit for natural language processing and computational linguistics
19
19
 
20
20
  **Resources**
21
21
 
22
- * Read the [latest documentation](http://rubydoc.info/github/louismullie/treat/master/frames).
22
+ * Read the [latest documentation](http://rubydoc.info/github/louismullie/treat/frames).
23
23
  * See how to [install Treat](https://github.com/louismullie/treat/wiki/Installing-Treat).
24
24
  * Learn how to [use Treat](https://github.com/louismullie/treat/wiki/Using-Treat).
25
25
  * Help out by [contributing to the project](https://github.com/louismullie/treat/wiki/Contributing-to-Treat).
@@ -10,7 +10,7 @@ module Treat
10
10
  end
11
11
 
12
12
  # The current version of Treat.
13
- VERSION = "1.0.0"
13
+ VERSION = "1.0.1"
14
14
 
15
15
  # Add methods to handle syntactic sugar,
16
16
  # language configuration options, and paths.
@@ -5,22 +5,24 @@ class Treat::AI::Classifiers::ID3
5
5
  @@classifiers = {}
6
6
 
7
7
  def self.classify(entity, options = {})
8
-
8
+
9
9
  set = options[:training]
10
10
  cl = set.classification
11
11
 
12
12
  if !@@classifiers[cl]
13
13
  dec_tree = DecisionTree::ID3Tree.new(
14
- set.labels, set.items,
15
- cl.default, :continuous)
14
+ set.labels.map { |l| l.to_s }, set.items,
15
+ cl.default, cl.mode)
16
16
  dec_tree.train
17
17
  else
18
18
  dec_tree = @@classifiers[cl]
19
19
  end
20
20
 
21
+ cl.export_item(entity, false).inspect
22
+
21
23
  dec_tree.predict(
22
24
  cl.export_item(entity, false)
23
- )[0]
25
+ )
24
26
 
25
27
  end
26
28
 
@@ -4,18 +4,19 @@ class Treat::Classification
4
4
  attr_reader :features
5
5
  attr_reader :question
6
6
  attr_reader :labels
7
+ attr_reader :mode
7
8
  attr_reader :default
8
9
 
9
- def initialize(type_or_types, feature_or_features, question, default = false)
10
-
10
+ def initialize(type_or_types, feature_or_features,
11
+ question, default = false, mode = :continuous)
11
12
  @types, @features,
12
13
  @question, @default =
13
14
  [*type_or_types],
14
15
  [*feature_or_features],
15
16
  question, default
16
17
 
18
+ @mode = mode
17
19
  @labels = []
18
-
19
20
  @features.each do |cmd|
20
21
  if cmd.is_a?(Array)
21
22
  @labels << cmd[0]
@@ -23,11 +24,9 @@ class Treat::Classification
23
24
  @labels << cmd
24
25
  end
25
26
  end
26
-
27
27
  end
28
28
 
29
29
  def export_item(e, include_question = true)
30
-
31
30
  line = []
32
31
 
33
32
  @features.each do |cmd|
@@ -46,16 +45,16 @@ class Treat::Classification
46
45
  end
47
46
  end
48
47
 
49
- begin
50
- if include_question
51
- line << e.send(@question)
48
+ if include_question
49
+ if e.has?(@question)
50
+ line << e.get(@question)
51
+ else
52
+ line << @default
52
53
  end
53
- rescue Treat::Exception
54
- line << @default
55
54
  end
55
+
56
56
  line[-1] = '' if line[-1].nil?
57
57
  line
58
-
59
58
  end
60
59
 
61
60
  end
@@ -95,7 +95,7 @@ module Treat::Entities::Abilities::Buildable
95
95
 
96
96
  f = Treat::Downloader.download(
97
97
  uri.scheme, uri.host, path, file)
98
- options[:_default_format] = :html
98
+ options[:default_to] ||= :html
99
99
 
100
100
  e = from_file(f, options)
101
101
  e.set :url, url
@@ -170,9 +170,8 @@ module Treat::Entities::Abilities::Buildable
170
170
  "point to a readable file."
171
171
  end
172
172
 
173
- dflt = options[:_default_format]
174
173
  fmt = Treat::Formatters::Readers::Autoselect.
175
- detect_format(file, dflt)
174
+ detect_format(file, options[:default_to])
176
175
  options[:_format] = fmt
177
176
 
178
177
  if fmt == :yaml || fmt == :yml ||
@@ -3,27 +3,29 @@ module Treat::Entities::Abilities::Countable
3
3
  # Find the position of the current entity
4
4
  # inside the parent entity, starting at 1.
5
5
  def position
6
-
7
6
  unless has_parent?
8
7
  raise Treat::Exception,
9
8
  "No parent to get position in."
10
9
  end
11
-
12
10
  parent.children.index(self) + 1
13
-
14
11
  end
15
12
 
13
+ # Find the position of this entity from
14
+ # the end of the parent entity.
15
+ def position_from_end
16
+ p = position
17
+ parent.size - p
18
+ end
19
+
16
20
  # Find the frequency of the entity in
17
21
  # the supplied parent or in the root
18
22
  # node if nil.
19
23
  def frequency_in(parent_type = nil)
20
-
21
24
  unless parent_type
22
25
  root.registry[:value][id]
23
26
  end
24
27
 
25
28
  registry(parent_type)[:value][value]
26
-
27
29
  end
28
30
 
29
31
  # Get the frequency of this entity's
@@ -99,6 +99,15 @@ module Treat::Entities::Abilities::Iterable
99
99
 
100
100
  alias :ancestors_with_type :ancestors_with_types
101
101
 
102
+ # Number of children that have a given feature.
103
+ def num_children_with_feature(feature)
104
+ i = 0
105
+ each do |c|
106
+ i += 1 if c.has?(feature)
107
+ end
108
+ i
109
+ end
110
+
102
111
  # Return the first element in the array, warning if not
103
112
  # the only one in the array. Used for magic methods: e.g.,
104
113
  # the magic method "word" if called on a sentence with many
@@ -35,6 +35,9 @@ module Treat::Entities::Abilities::Magical
35
35
  entities_with_type($1.intern)
36
36
  elsif method =~ /^#{@@entities_regexp}$/
37
37
  first_but_warn(entities_with_type($1.intern), $1)
38
+ elsif method =~ /^first_#{@@entities_regexp}$/
39
+ e = entities_with_type($1.intern)
40
+ e ? e[0] : nil
38
41
  elsif method =~ /^parent_#{@@entities_regexp}$/
39
42
  ancestor_with_type($1.intern)
40
43
  elsif method =~ /^each_#{@@entities_regexp}$/
@@ -59,21 +62,23 @@ module Treat::Entities::Abilities::Magical
59
62
  entities_with_category($1.intern)
60
63
  elsif method =~ /^#{@@cats_regexp}$/
61
64
  first_but_warn(entities_with_category($1.intern), $1)
65
+ elsif method =~ /^first_#{@@cats_regexp}$/
66
+ e = entities_with_category($1.intern)
67
+ e ? e[0] : nil
62
68
  elsif method =~ /^#{@@cats_regexp}_count$/
63
69
  entities_with_category($1.intern).size
70
+ elsif method =~ /^(.*)_count$/
71
+ num_children_with_feature($1.intern)
64
72
  elsif method =~ /^#{@@cats_regexp}s_with_([a-z]*)$/
65
73
  entities_with_feature($2.intern, args[0], $1)
66
74
  elsif method =~ /^#{@@cats_regexp}_with_([a-z]*)$/
67
75
  first_but_warn(entities_with_feature(
68
76
  $2.intern, args[0], $1.intern), $1)
69
- elsif method =~ /^([a-z]*)_of_first_#{@@entities_regexp}$/
70
- f = send(:"#{$2}s".intern).first
77
+ elsif method =~ /^([a-z]*)_of_(.*)$/
78
+ f = send($2.intern)
71
79
  f ? f.send($1.intern) : nil
72
80
  elsif method =~ /^frequency_in_#{@@entities_regexp}$/
73
81
  frequency_in($1.intern)
74
- # first_word
75
- # tag_of_first_verb
76
- # tag_of_title
77
82
  else
78
83
  return :no_magic
79
84
  end
@@ -13,9 +13,9 @@ class Treat::Extractors::Keywords::TfIdf
13
13
 
14
14
  options = DefaultOptions.merge(options)
15
15
  tf_idfs = {}
16
+
16
17
  entity.each_word do |word|
17
- word.check_has(:tf_idf, false)
18
- tf_idfs[word] ||= word.get(:tf_idf)
18
+ tf_idfs[word] ||= word.tf_idf
19
19
  end
20
20
 
21
21
  tf_idfs = tf_idfs.
@@ -32,29 +32,32 @@ class Treat::Extractors::Keywords::TfIdf
32
32
 
33
33
  w = word[0].to_s
34
34
  next if keywords.include?(w)
35
-
36
- entity.each_word_with_value(w) do |w2|
37
-
38
- ps = w2.parent_phrase
39
-
40
- if ps.has?(:keyword_count)
41
- ps.set :keyword_count,
42
- ps.keyword_count + 1
43
- else
44
- ps.set :keyword_count, 1
45
- end
46
- ps.set :keyword_density,
47
- (ps.keyword_count / ps.size)
48
-
49
- end
50
-
51
35
  break if i > options[:number]
52
36
  keywords << w
53
37
 
54
38
  i += 1
55
39
  end
56
40
 
41
+ entity.each_word do |word|
42
+
43
+ if keywords.include?(word.to_s)
44
+ word.set :is_keyword?, true
45
+ pp = entity.parent_phrase
46
+ next unless pp
47
+ if pp.has? :keyword_count
48
+ pp.set :keyword_count,
49
+ pp.keyword_count + 1
50
+ else
51
+ pp.set :keyword_count, 1
52
+ end
53
+ else
54
+ word.set :is_keyword?, false
55
+ end
56
+
57
+ end
58
+
57
59
  keywords
60
+
58
61
  end
59
62
 
60
63
  end
@@ -19,7 +19,7 @@ class Treat::Loaders
19
19
  StanfordCoreNLP.log_file =
20
20
  NULL_DEVICE if Treat.silence
21
21
 
22
- StanfordCoreNLP.init
22
+ StanfordCoreNLP.bind
23
23
  @@loaded = true
24
24
 
25
25
  end
@@ -36,11 +36,24 @@ describe Treat::Entities::Document do
36
36
  "a document with the contents of the file" do
37
37
  url = 'http://www.rubyinside.com/nethttp-cheat-sheet-2940.html'
38
38
  d = Treat::Entities::Document.build(url)
39
+ d.format.should eql :html
40
+ d.print_tree
39
41
  d.should be_an_instance_of Treat::Entities::Document
40
42
  d.to_s.index('Rubyist').should_not eql nil
41
43
  end
42
44
  end
43
45
 
46
+ context "when supplied with a url with no file extension" do
47
+ it "downloads the file the URL points to and opens " +
48
+ "a document with the contents of the file, assuming " +
49
+ "the downloaded file to be in HTML format" do
50
+ url = 'http://www.economist.com/node/21552208'
51
+ d = Treat::Entities::Document.build(url)
52
+ d.should be_an_instance_of Treat::Entities::Document
53
+ d.to_s.index('Ronnie Lupe').should_not eql nil
54
+ end
55
+ end
56
+
44
57
  context "when called with anything else than a " +
45
58
  "readable file name or url" do
46
59
 
@@ -1,5 +1,116 @@
1
+ =begin
1
2
  require_relative '../lib/treat'
2
3
 
3
- c = Collection (Treat.spec + 'samples/mathematicians')
4
- c.do :chunk, :segment, :tokenize, :tf_idf, :keywords
5
- c.visualize :dot, :file => 'test2.dot', :remove_types => [:paragraph]
4
+ def extract(sentences, n)
5
+ sentences.to_a.values_at(
6
+ *sentences.values
7
+ .each_with_index
8
+ .sort.reverse
9
+ .map(&:last)
10
+ .sort.take(n))
11
+ .map(&:first)
12
+ end
13
+
14
+ describe "#summarize" do
15
+
16
+ it "provides a summary of the text" do
17
+
18
+ Treat.debug = true
19
+ Treat.silence = true
20
+
21
+
22
+ context = Treat::DataSet.open('economist-context.yml')
23
+ content = Treat::DataSet.open('economist-content.yml')
24
+
25
+ c = Collection (Treat.spec + 'economist')
26
+ c.do :chunk, :segment, :tokenize, :tag, :name_tag, :tf_idf, :keywords
27
+
28
+ c.each_document do |d|
29
+
30
+ sentences = {}
31
+
32
+ d.each_sentence do |sentence|
33
+ cx = sentence.classify(:training => context)
34
+ ct = sentence.classify(:training => content)
35
+ sentences[sentence] = cx[1] + ct[1]
36
+ end
37
+
38
+ puts
39
+ puts d.titles[0].to_s
40
+ puts
41
+
42
+ puts extract(sentences, 5).map { |p| p.to_s }.join(' ')
43
+
44
+ end
45
+
46
+ c.serialize file: 'economist-coll.yaml'
47
+
48
+ c = Collection (Treat.spec + 'economist')
49
+ c.do :chunk, :segment, :tokenize, :tag, :name_tag, :tf_idf, :keywords
50
+
51
+ # Topic word count ? Synonyms of keywords ?
52
+ # Time expressions?
53
+ classify_content = Treat::Classification.new(
54
+ :phrase,
55
+ [:word_count, :number_count,
56
+ :keyword_count, :name_tag_count],
57
+ :has_key_content?
58
+ )
59
+
60
+ classify_context = Treat::Classification.new(
61
+ :phrase,
62
+ [:position,
63
+ :position_from_end,
64
+ :type_of_parent_zone,
65
+ :value_of_first_word,
66
+ :tag_of_first_word
67
+ ],
68
+ :has_key_context?,
69
+ false,
70
+ :discrete
71
+ )
72
+
73
+ c.each_sentence do |s|
74
+ puts s.to_s
75
+ a = STDIN.gets.to_s.strip
76
+ if a == ''
77
+ s.set :has_key_content?, false
78
+ s.set :has_key_context?, false
79
+ else
80
+ s.set :has_key_content?, true
81
+ s.set :has_key_context?, true
82
+ end
83
+ end
84
+
85
+ context = c.export(classify_context)
86
+ content = c.export(classify_content)
87
+
88
+ context.save('economist-context.yml')
89
+ content.save('economist-content.yml')
90
+
91
+ context = Treat::DataSet.open('economist-context.yml')
92
+ content = Treat::DataSet.open('economist-content.yml')
93
+
94
+ c.each_document do |d|
95
+
96
+ sentences = {}
97
+
98
+ d.each_sentence do |sentence|
99
+ cx = sentence.classify(:training => context)
100
+ ct = sentence.classify(:training => content)
101
+ sentences[sentence] = cx[1] + ct[1]
102
+ end
103
+
104
+ puts
105
+ puts d.titles[0].to_s
106
+ puts
107
+
108
+ puts extract(sentences, 5).map { |p| p.to_s }.join(' ')
109
+
110
+ end
111
+
112
+
113
+ end
114
+
115
+ end
116
+ =end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: treat
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-05 00:00:00.000000000 Z
12
+ date: 2012-04-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rubyzip