treat 0.1.2 → 0.1.3

Files changed (100)
  1. data/LICENSE +7 -8
  2. data/TODO +16 -13
  3. data/examples/keywords.rb +89 -1
  4. data/lib/treat/buildable.rb +1 -8
  5. data/lib/treat/categories.rb +3 -4
  6. data/lib/treat/category.rb +1 -1
  7. data/lib/treat/delegatable.rb +1 -1
  8. data/lib/treat/detectors/encoding/native.rb +5 -0
  9. data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
  10. data/lib/treat/detectors/language/language_detector.rb +4 -0
  11. data/lib/treat/detectors/language/what_language.rb +4 -4
  12. data/lib/treat/detectors.rb +1 -1
  13. data/lib/treat/entities/entity.rb +5 -3
  14. data/lib/treat/entities/tokens.rb +14 -5
  15. data/lib/treat/entities/zones.rb +4 -0
  16. data/lib/treat/entities.rb +7 -5
  17. data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
  18. data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
  19. data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
  20. data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
  21. data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
  22. data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
  23. data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
  24. data/lib/treat/extractors/time/chronic.rb +8 -0
  25. data/lib/treat/extractors/time/native.rb +6 -0
  26. data/lib/treat/extractors/time/nickel.rb +31 -23
  27. data/lib/treat/extractors/topic_words/lda.rb +21 -16
  28. data/lib/treat/extractors/topics/reuters.rb +6 -4
  29. data/lib/treat/extractors.rb +7 -7
  30. data/lib/treat/formatters/readers/abw.rb +32 -0
  31. data/lib/treat/formatters/readers/autoselect.rb +13 -11
  32. data/lib/treat/formatters/readers/doc.rb +13 -0
  33. data/lib/treat/formatters/readers/gocr.rb +2 -0
  34. data/lib/treat/formatters/readers/html.rb +21 -1
  35. data/lib/treat/formatters/readers/ocropus.rb +3 -3
  36. data/lib/treat/formatters/readers/odt.rb +41 -0
  37. data/lib/treat/formatters/readers/pdf.rb +5 -2
  38. data/lib/treat/formatters/readers/txt.rb +2 -0
  39. data/lib/treat/formatters/serializers/xml.rb +3 -2
  40. data/lib/treat/formatters/serializers/yaml.rb +2 -0
  41. data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
  42. data/lib/treat/formatters/unserializers/xml.rb +6 -1
  43. data/lib/treat/formatters/unserializers/yaml.rb +5 -1
  44. data/lib/treat/formatters/visualizers/dot.rb +35 -37
  45. data/lib/treat/formatters/visualizers/html.rb +1 -0
  46. data/lib/treat/formatters/visualizers/inspect.rb +4 -0
  47. data/lib/treat/formatters/visualizers/short_value.rb +18 -3
  48. data/lib/treat/formatters/visualizers/standoff.rb +11 -6
  49. data/lib/treat/formatters/visualizers/tree.rb +5 -1
  50. data/lib/treat/formatters/visualizers/txt.rb +6 -1
  51. data/lib/treat/formatters.rb +1 -1
  52. data/lib/treat/group.rb +4 -3
  53. data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
  54. data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
  55. data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
  56. data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
  57. data/lib/treat/inflectors/stem/porter.rb +6 -2
  58. data/lib/treat/inflectors/stem/porter_c.rb +4 -1
  59. data/lib/treat/inflectors/stem/uea.rb +4 -4
  60. data/lib/treat/languages/english/tags.rb +16 -0
  61. data/lib/treat/languages/english.rb +4 -1
  62. data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
  63. data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
  64. data/lib/treat/lexicalizers/tag/brill.rb +3 -11
  65. data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
  66. data/lib/treat/lexicalizers.rb +0 -2
  67. data/lib/treat/processors/chunkers/txt.rb +4 -4
  68. data/lib/treat/processors/parsers/enju.rb +3 -17
  69. data/lib/treat/processors/parsers/stanford.rb +4 -0
  70. data/lib/treat/processors/segmenters/punkt.rb +1 -0
  71. data/lib/treat/processors/segmenters/stanford.rb +4 -0
  72. data/lib/treat/processors/segmenters/tactful.rb +4 -1
  73. data/lib/treat/processors/tokenizers/punkt.rb +1 -2
  74. data/lib/treat/processors/tokenizers/stanford.rb +4 -0
  75. data/lib/treat/processors/tokenizers/tactful.rb +1 -1
  76. data/lib/treat/processors.rb +4 -4
  77. data/lib/treat/proxies.rb +18 -11
  78. data/lib/treat/registrable.rb +12 -5
  79. data/lib/treat/sugar.rb +8 -3
  80. data/lib/treat/tree.rb +10 -3
  81. data/lib/treat.rb +55 -55
  82. data/test/tc_entity.rb +7 -7
  83. data/test/tc_extractors.rb +6 -4
  84. data/test/tc_formatters.rb +0 -4
  85. data/test/tests.rb +2 -0
  86. data/test/texts.rb +4 -4
  87. metadata +48 -56
  88. data/examples/texts/bugged_out.txt +0 -26
  89. data/examples/texts/half_cocked_basel.txt +0 -16
  90. data/examples/texts/hedge_funds.txt +0 -24
  91. data/examples/texts/hose_and_dry.txt +0 -19
  92. data/examples/texts/hungarys_troubles.txt +0 -46
  93. data/examples/texts/indias_slowdown.txt +0 -15
  94. data/examples/texts/merkozy_rides_again.txt +0 -24
  95. data/examples/texts/prada_is_not_walmart.txt +0 -9
  96. data/examples/texts/republican_nomination.txt +0 -26
  97. data/examples/texts/to_infinity_and_beyond.txt +0 -15
  98. data/lib/treat/entities/text.rb +0 -7
  99. data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
  100. data/lib/treat/formatters/cleaners/html.rb +0 -17
data/LICENSE CHANGED
@@ -18,11 +18,10 @@ Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright 2012.
  Non-trivial amount of code has been incorporated and modified from
  other libraries, specifically for the following files:
 
- - processors/tokenizers/macintyre.rb - Utiyama Masao (Ruby License)
- - processors/tokenizers/tactful.rb - Matthew Bunday (GPL license)
- - Inflectors/lemmatizers/e_lemma.rb - Utiyama Masao (GPL license)
- - processors/tokenizers/perl.rb - Todd A. Fisher (MIT License)
- - processors/tokenizers/punkt.rb - Steven Bird Edward Loper and Joel Nothman (Apache 2.0 license),
- - extractors/topics/reuters.rb - Mark Watson (GPL license)
- - inflectors/stemmers/porter.rb - Ray Pereda (No license information)
- - tree.rb - Partyl based on work by
+ - processors/tokenizers/macintyre.rb - Utiyama Masao (Ruby License)
+ - processors/tokenizers/tactful.rb - Matthew Bunday (GPL license)
+ - processors/tokenizers/perl.rb - Todd A. Fisher (MIT License)
+ - processors/tokenizers/punkt.rb - Steven Bird Edward Loper and Joel Nothman (Apache 2.0 license)
+ - extractors/topics/reuters.rb - Mark Watson (GPL license)
+ - inflectors/stemmers/porter.rb - Ray Pereda (No license information)
+ - tree.rb - Partly based on work by Anupam Sengupta (Creative Commons Attribution-ShareAlike Unported v. 3.0)
data/TODO CHANGED
@@ -1,25 +1,26 @@
  ## Urgent
 
- - Linkers
+ - Linkers + documentation
  - Check taggers for context
  - Stanford dependencies parse
- - Enju: test
  - Ocropus => use better function
  - Optimize magic methods... is_token? type methods, phrase categories.
- - Move statistics?
+ - Move statistics
  - Synset class move
  - general procedure for options, check that user doesn't want to change options...
- - Languages: dependencies vs. edges, PTB function tags
- - Check for # Fix everywhere
- - Check paths; parse bin paths
- - Ferret, Natural Inputs
- - Use consistently delegate
- - Text becomes section
- - Remove top level
+ - Ferret, Spider
  - Loading multiple JARs
+ - Linguistics loader, stanford loader
  - Tokenized sentences are not parsed
- - Documentation
- - Remove feature
+ - Dot colors
+ - Fix encoders
+ - Fix Punkt segmenter training text.
+ - Mark Watson's text extractor
+ - Statistics position in
+ - Fix documentation antiword, Graphviz, # encoding: utf-8
+ - Shortcut methods.. pre/postprocessors
+ - Only Phrase..
+ - Frequency in
 
  ## Eventually
 
@@ -52,4 +53,6 @@
  - String type detector for other languages
  - Automatic benchmark
  - Raspell spell checker
- - Multithreading
+ - Multithreading
+ - Mark Watson's Java NLP utility to identify proper nouns (human names and places) in text
+ - FastTag a Java fast part of speech tagger.
data/examples/keywords.rb CHANGED
@@ -57,4 +57,92 @@ c.each_document do |d|
  end
  end
 
- end
+ end
+
+
+
+ Treat.edulcorate
+ Treat.bin = '/ruby/nat/bin'
+
+ c = Collection 'economist'
+ c.each_document { |doc| doc.chunk.segment.tokenize }
+
+ topic_words = c.topic_words(
+ :lda,
+ :topics => 5,
+ :words_per_topic => 5,
+ :iterations => 20
+ )
+
+ keywords = c.keywords(
+ :topics_frequency,
+ :topic_words => topic_words,
+ :tf_idf_threshold => 180
+ )
+
+ puts keywords.inspect
+
+ abort
+
+ c = Phrase 'a test clause'
+ c.parse
+ puts c.visualize(:tree)
+ puts c.visualize(:inspect)
+ puts c.visualize(:short_value)
+ puts c.visualize(:standoff)
+ puts c.visualize(:tree)
+
+ c.serialize(:yaml).save('test.yml')
+ c.serialize(:xml).save('test.xml')
+
+ d = Phrase 'test.yml'
+ d.print_tree
+ d = Phrase 'test.xml'
+ d.print_tree
+
+ puts d.words[0].position_in_parent
+ abort
+
+ w = Word 'running'
+ puts w.stem(:porter_c)
+ puts w.stem(:porter)
+ puts w.stem(:uea)
+
+ w = Word 'run'
+
+ puts w.infinitive(:linguistics)
+ puts w.present_participle(:linguistics)
+ puts w.plural(:linguistics)
+
+ w = Word 'table'
+
+ puts w.synonyms.inspect
+ puts w.antonyms.inspect
+ puts w.hyponyms.inspect
+ puts w.hypernyms.inspect
+
+ n = Number 2
+ puts n.ordinal_words(:linguistics)
+ puts n.cardinal_words(:linguistics)
+
+ s = Sentence 'A sentence to parse.'
+ s.dup.parse(:enju).print_tree
+ s.dup.parse(:stanford).print_tree
+
+ s = Sentence 'A sentence to tokenize'
+ s.dup.tokenize(:macintyre).print_tree
+ s.dup.tokenize(:multilingual).print_tree
+ s.dup.tokenize(:perl).print_tree
+ s.dup.tokenize(:punkt).print_tree
+ s.dup.tokenize(:stanford).print_tree
+ s.dup.tokenize(:tactful).print_tree
+
+
+ =begin
+ c = Collection 'economist'
+ # c.each_document { |d| d.chunk.segment.tokenize }
+ c.documents[0].chunk.segment
+ c.sentences[0].parse(:enju)
+ c.each_word { |word| word.stem }
+ c.visualize(:dot, features: [:tag]).save('test.dot')
+ =end
@@ -24,9 +24,8 @@ module Treat
  "Cannot create a document or collection from " +
  "a string (need a readable file/folder)."
  end
- string = string.to_s
  dot = string.count('.') + string.count('!') + string.count('?')
- return Treat::Entities::Text.new(string) if dot > 1 ||
+ return Treat::Entities::Section.new(string) if dot > 1 ||
  (string.count("\n") > 0 && dot == 1)
  return Treat::Entities::Sentence.new(string) if dot == 1 && string.size > 5
  if string.count(' ') == 0
@@ -99,12 +98,6 @@ module Treat
  d.read
  end
  def from_serialized_file(file)
- unless [Treat::Entities::Document,
- Treat::Entities::Collection].include?(self)
- raise Treat::Exception,
- "Cannot create something else than a " +
- "document from raw file '#{file}'."
- end
  d = Treat::Entities::Document.new(file)
  d.unserialize
  d.children[0].set_as_root!
@@ -1,17 +1,16 @@
  module Treat
  # This module keeps track of all categories that
- # exist and the methods they implement, and is
- # responsible for including the categories.
+ # exist and the methods they implement.
  module Categories
- # A list of categories.
  class << self; attr_accessor :list; end
+ # Array - list of all categories.
  self.list = []
  # Boolean - does any of the categories have
  # a method that corresponds to sym?
  def self.have_method?(sym); methods.include?(sym); end
  # Cache the list of methods once it has been computed.
  @@methods = []
- # Provide a list of all methods implemented
+ # Array - provide a list of all methods implemented
  # by all Treat categories.
  def self.methods
  return @@methods unless @@methods.empty?
@@ -12,7 +12,7 @@ module Treat
  groups.each do |group|
  group = const_get(group)
  group.targets.each do |entity_type|
- entity = Entities.const_get(cc(entity_type))
+ entity = Treat::Entities.const_get(cc(entity_type))
  entity.class_eval { add_delegators group }
  end
  end
@@ -46,7 +46,7 @@ module Treat
  delegate_klass = group.const_get(:"#{cc(delegate.to_s)}")
  result = entity.accept(group, delegate_klass, m, options)
  if decorator
- result = group.send(decorator, self, result)
+ result = group.send(decorator, entity, result)
  end
  if group.type == :annotator
  f = decorator.nil? ? m : decorator
@@ -1,7 +1,12 @@
  module Treat
  module Detectors
  module Encoding
+ # A wrapper class for Ruby's native encoding detector.
  class Native
+ # Return the encoding of the entity according
+ # to the Ruby interpreter.
+ #
+ # Options: none.
  def self.encoding(entity, options={})
  entity.value.encoding.name.
  gsub('-', '_').downcase.intern
@@ -6,9 +6,8 @@ module Treat
  # A wrapper for the 'rchardet19' gem, which
  # detects the encoding of a file.
  class RChardet19
- # Returns an Encoding object representing
- # the encoding of the supplied entity's
- # text value.
+ # Returns the encoding of the entity according
+ # to the 'rchardet19' gem.
  #
  # Options: none.
  def self.encoding(entity, options={})
@@ -1,6 +1,10 @@
  module Treat
  module Detectors
  module Language
+ # A generic language detector, which is called before
+ # any language detector and ensures that configuration
+ # options concerning language are enforced (e.g. returns
+ # the default language when Treat.detect_language is false).
  class LanguageDetector
  def self.language(entity, options = {})
  if Treat.detect_language == false
@@ -7,7 +7,7 @@ module Treat
  # performs probabilistic language detection.
  class WhatLanguage < LanguageDetector
  # Keep only once instance of the gem class.
- @@wl = nil
+ @@detector = nil
  # Detect the language of an entity using the
  # 'whatlanguage' gem. Return an identifier
  # corresponding to the ISO-639-2 code for the
@@ -15,10 +15,10 @@ module Treat
  def self.language(entity, options = {})
  predetection = super(entity, options)
  return predetection if predetection
- @@wl ||= ::WhatLanguage.new(:all)
- all = @@wl.process_text(entity.to_s)
+ @@detector ||= ::WhatLanguage.new(:possibilities)
+ possibilities = @@detector.process_text(entity.to_s)
  lang = {}
- all.each do |k,v|
+ possibilities.each do |k,v|
  lang[Treat::Languages.find(k)] = v
  end
  Treat::Feature.new(lang).best
@@ -2,7 +2,7 @@ module Treat
  # Detectors detect a specific meta-information about
  # an entity, such as encoding, format and language.
  #
- # Detectors are language-independent, and thus they
+ # Detectors are language-independent, and thus there
  # are default algorithms specified for each of them.
  module Detectors
  # Group for algorithms that detect encoding.
@@ -43,7 +43,7 @@ module Treat
  # feature does not exist
  def method_missing(sym, *args, &block)
  return self.build(*args) if sym == nil
- if !@features[sym]
+ if !@features.has_key?(sym)
  r = parse_magic_method(sym, *args, &block)
  if r == :no_magic
  begin
@@ -168,7 +168,10 @@ module Treat
  def <<(entities, clear_parent = true)
  entities = [entities] unless entities.is_a? Array
  entities.each do |entity|
- register_token(entity) if entity.is_leaf?
+ if entity.is_a?(Treat::Entities::Token) ||
+ entity.is_a?(Treat::Entities::Constituent)
+ register_token(entity) unless entity.value == ''
+ end
  end
  super(entities)
  @parent.value = '' if has_parent?
@@ -211,7 +214,6 @@ module Treat
  def short_value(ml = 6); visualize(:short_value, :max_length => ml); end
  # Convenience functions. Convenience decorators.
  def frequency_of(word); statistics(:frequency_of, value: word); end
-
  private
  # Return the first element in the array, warning if not
  # the only one in the array. Used for magic methods: e.g.,
@@ -4,15 +4,24 @@ module Treat
  class Token < Entity
  # All tokens are leafs.
  def is_leaf?; true; end
- def frequency; self.set :frequency, statistics(:frequency); end
+ # Convenience function for statistics.
+ def frequency; statistics(:frequency_in); end
+ def frequency_in(type); statistics(:frequency_in, type: type); end
+ def position_in(type); statistics(:position_in_parent); end
+ def tf_idf; statistics(:tf_idf); end
  end
  # Represents a word.
  class Word < Token
- def infinitive(conjugator = nil); conjugate(conjugator, :mode => :infinitive); end
- def present_participle(conjugator = nil); conjugate(conjugator, :tense => :present, :mode => :participle); end
- def plural(declensor = nil); declense(declensor, :count => :plural); end
- def singular(declensor = nil); declense(declensor, :count => :singular); end
+ # Convenience function for conjugations.
+ def infinitive(conjugator = nil); conjugations(conjugator, :mode => :infinitive); end
+ # Convenience function for conjugations.
+ def present_participle(conjugator = nil); conjugations(conjugator, :tense => :present, :mode => :participle); end
+ # Convenience function for declensions.
+ def plural(declensor = nil); declensions(declensor, :count => :plural); end
+ # Convenience function for declensions.
+ def singular(declensor = nil); declensions(declensor, :count => :singular); end
  end
+ # Represents a clitic ('s).
  class Clitic < Token
  end
  # Represents a number.
@@ -13,5 +13,9 @@ module Treat
  # Represents a list.
  class List < Zone
  end
+ # Represents a section, usually with a title
+ # and at least one paragraph.
+ class Section < Zone
+ end
  end
  end
@@ -14,7 +14,6 @@ module Treat
  # Then require all possible entities.
  require 'treat/entities/collection'
  require 'treat/entities/document'
- require 'treat/entities/text'
  require 'treat/entities/zones'
  require 'treat/entities/sentence'
  require 'treat/entities/constituents'
@@ -25,9 +24,11 @@ module Treat
  const_get(entity).build(value, id)
  end
  end
+ # Cache a list of defined entity types to
+ # improve performance.
+ @@list = []
  # Provide a list of defined entity types,
  # as non-camel case identifiers.
- @@list = []
  def self.list
  return @@list unless @@list.empty?
  self.constants.each do |constant|
@@ -35,16 +36,17 @@ module Treat
  end
  @@list
  end
- # Return the 'z-order' for hierarchical
- # comparison of entity types.
+ # Return the hierarchy level of the entity
+ # class, the minimum being a Token and the
+ # maximum being a Collection.
  def self.rank(type)
  klass = Entities.const_get(cc(type))
  compare = lambda { |a,b| a == b || a < b }
  return 0 if compare.call(klass, Token)
  return 1 if compare.call(klass, Constituent)
  return 2 if compare.call(klass, Sentence)
+ return 3 if compare.call(klass, Zone)
  return 4 if compare.call(klass, Document)
- return 3 if compare.call(klass, Section)
  return 5 if compare.call(klass, Collection)
  end
  end
@@ -0,0 +1,40 @@
+ module Treat
+ module Extractors
+ module Keywords
+ class TopicsFrequency
+ DefaultOptions = {tf_idf_threshold: 180, topic_words: nil}
+ def self.keywords(entity, options = {})
+ options = DefaultOptions.merge(options)
+ unless options[:topic_words]
+ raise Treat::Exception, "You must supply topic words."
+ end
+ if Treat::Entities.rank(entity.type) <
+ Treat::Entities.rank(:sentence)
+ raise Treat::Exception, 'Cannot get the key ' +
+ 'sentences of an entity smaller than a sentence.'
+ else
+ find_keywords(entity, options)
+ end
+ end
+ def self.find_keywords(entity, options)
+ keywords = []
+ entity.each_word do |word|
+ found = false
+ options[:topic_words].each do |i, topic_words|
+ next if keywords.include?(word.value)
+ if topic_words.include?(word.value)
+ found = true
+ tf_idf = word.tf_idf
+ if tf_idf < options[:tf_idf_threshold]
+ keywords << word.value
+ word.set :is_keyword?, found
+ end
+ end
+ end
+ end
+ keywords
+ end
+ end
+ end
+ end
+ end
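The extractor added above flags a word as a keyword when it appears among the supplied LDA topic words and its tf-idf score falls below the :tf_idf_threshold option. A minimal usage sketch, adapted from the data/examples/keywords.rb example earlier in this diff (the 'economist' corpus folder and the bin path are that example's values, not requirements):

  require 'treat'              # load the gem
  Treat.edulcorate             # enable the builder syntax (Collection, Word, ...)
  Treat.bin = '/ruby/nat/bin'  # adjust to wherever your binaries live
  c = Collection 'economist'
  c.each_document { |doc| doc.chunk.segment.tokenize }
  # Topic words from the LDA extractor feed the keyword extractor.
  topic_words = c.topic_words(:lda, :topics => 5, :words_per_topic => 5, :iterations => 20)
  keywords = c.keywords(:topics_frequency, :topic_words => topic_words, :tf_idf_threshold => 180)
  puts keywords.inspect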
@@ -1,15 +1,16 @@
  module Treat
  module Extractors
  module Statistics
- class Frequency
- # Find the frequency of a given string value.
+ class FrequencyIn
+ DefaultOptions = {type: nil}
  def self.statistics(entity, options={})
+ options = DefaultOptions.merge(options)
  if entity.is_leaf?
  w = entity.value.downcase
- if entity.token_registry[:value][w].nil?
+ if entity.token_registry(options[:type])[:value][w].nil?
  0
  else
- entity.token_registry[:value][w].size
+ entity.token_registry(options[:type])[:value][w].size
  end
  else
  raise Treat::Exception,
@@ -5,11 +5,9 @@ module Treat
  # Find the frequency of a given string value.
  def self.statistics(entity, options = {})
  w = options[:value]
- if entity.token_registry[:value][w].nil?
- 0
- else
- entity.token_registry[:value][w].size
- end
+ raise Treat::Exception, "Must supply a non-nil value." unless w
+ entity.token_registry[:value][w].nil? ? 0 :
+ entity.token_registry[:value][w].size
  end
  end
  end
@@ -1,11 +1,12 @@
  module Treat
  module Extractors
  module Statistics
- class PositionIn
+ class PositionInParent
  # Find the position of the current entity
  # inside the parent entity with type entity_type.
- def self.statistics(entity)
- raise Treat::Exception, 'Could you implement this?'
+ # Not implemented.
+ def self.statistics(entity, options = {})
+ entity.parent.children.index(entity)
  end
  end
  end
@@ -0,0 +1,36 @@
+ module Treat
+ module Extractors
+ module Statistics
+ # "The term count in the given document is simply the
+ # number of times a given term appears in that document.
+ # This count is usually normalized to prevent a bias
+ # towards longer documents (which may have a higher
+ # term count regardless of the actual importance of
+ # that term in the document) to give a measure of the
+ # importance of the term t within the particular document d.
+ # Thus we have the term frequency tf(t,d), defined in the
+ # simplest case as the occurrence count of a term in a document.
+ #
+ # The inverse document frequency is a measure of the general
+ # importance of the term (obtained by dividing the total number
+ # of documents by the number of documents containing the term,
+ # and then taking the logarithm of that quotient)."
+ #
+ # (From Wikipedia)
+ class TfIdf
+ DefaultOptions = { type: nil }
+ def self.statistics(entity, options={})
+ tf = entity.frequency_in(:document)
+ tf = tf / entity.root.word_count
+ d = entity.root.document_count
+ i = 0
+ entity.root.each_document do |document|
+ i += 1 if document.frequency_of(entity.value)
+ end
+ idf = ::Math.log(d.to_f/(i.to_f + 1)).abs
+ tf.to_f/idf.to_f
+ end
+ end
+ end
+ end
+ end
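As a quick illustration of the definition quoted in the comment above, textbook tf-idf multiplies a length-normalized term frequency by the log-scaled inverse document frequency. The snippet below is a standalone sketch with invented counts, not code from the gem; note that the TfIdf class above departs slightly from the textbook form: it normalizes tf by the word count of the whole collection, adds one to the count of documents containing the term, and divides by the absolute value of the logarithm.

  # Textbook tf-idf with made-up counts, for illustration only.
  term_count_in_doc = 4.0    # occurrences of the term in document d
  words_in_doc      = 200.0  # length of d, used to normalize tf
  total_documents   = 10.0   # number of documents in the collection
  docs_with_term    = 3.0    # documents that contain the term
  tf  = term_count_in_doc / words_in_doc
  idf = Math.log(total_documents / docs_with_term)
  puts tf * idf              # => ~0.024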
@@ -1,23 +1,23 @@
  module Treat
  module Extractors
  module Statistics
+ # Experimental algorithm to generate transition matrices.
  class TransitionMatrix
-
+ DefaultOptions = {
+ normalize: true,
+ features: [:tag],
+ condition: lambda { |e| true },
+ entity_types: [:word],
+ relationships: [:parent, :right, :children]
+ }
  # Find the transition matrix.
  def self.statistics(entity, options={})
-
- normalize = options[:normalize] || true
- features = options[:features] || [:tag]
- condition = options[:condition] || lambda { |e| true }
- entity_types = options[:entity_types] ? options[:entity_types] :
- [options[:entity_type]]
- relationships = options[:relationships] ||
- [:parent, :left, :right, :children]
+ options = DefaultOptions.merge(options)
 
  # Create lambdas to generate the arrays.
- empty_prototype = {}; features.each { |f| empty_prototype[f] = {} }
+ empty_prototype = {}; options[:features].each { |f| empty_prototype[f] = {} }
  empty = lambda { Marshal.load(Marshal.dump(empty_prototype)) }
- empty2_prototype = {}; relationships.each { |r| empty2_prototype[r] = empty.call }
+ empty2_prototype = {}; options[:relationships].each { |r| empty2_prototype[r] = empty.call }
  empty2 = lambda { Marshal.load(Marshal.dump(empty2_prototype)) }
 
  # Deep (recursive) merger.
@@ -27,24 +27,25 @@ module Treat
 
  # Master matrix.
  mm = nil
+ tm = empty.call
 
- entity.each_entity(*entity_types) do |target|
-
- next unless condition.call(target)
+ entity.each_entity(*options[:entity_types]) do |target|
+
+ next unless options[:condition].call(target)
 
  # Initialize the empty transition matrix.
- tm = empty.call
+
 
  # Calculate the transition probabilities.
- features.each do |f1|
+ options[:features].each do |f1|
 
  v1 = target.send(f1)
  tm[f1][v1] = empty2.call
 
- relationships.each do |relationship|
+ options[:relationships].each do |relationship|
  tm[f1][v1][relationship] = empty.call
-
+
- features.each do |f2|
+ options[:features].each do |f2|
  relatives = target.send(relationship)
  relatives = [relatives] unless relatives.is_a? Array
  relatives.each do |relative|
@@ -55,9 +56,9 @@ module Treat
  tm[f1][v1][relationship][f2][v2] += 1.0
  end
  end
-
+
  tm[f1][v1][:edge] = empty.call
-
+
  target.edges.each do |id, edge_type|
  s = target.ancestor_with_type :sentence
  if s
@@ -68,14 +69,13 @@ module Treat
  tm[f1][v1][:edge][f2][v2] += 1.0
  end
  end
-
+
  end
  end
  end
-
- mm = mm ? mm.merge(tm, &merger) : tm
  end
- if normalize
+ mm = mm ? mm.merge(tm, &merger) : tm
+ if options[:normalize]
  normalize(mm)
  else
  mm