treat 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. data/LICENSE +7 -8
  2. data/TODO +16 -13
  3. data/examples/keywords.rb +89 -1
  4. data/lib/treat/buildable.rb +1 -8
  5. data/lib/treat/categories.rb +3 -4
  6. data/lib/treat/category.rb +1 -1
  7. data/lib/treat/delegatable.rb +1 -1
  8. data/lib/treat/detectors/encoding/native.rb +5 -0
  9. data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
  10. data/lib/treat/detectors/language/language_detector.rb +4 -0
  11. data/lib/treat/detectors/language/what_language.rb +4 -4
  12. data/lib/treat/detectors.rb +1 -1
  13. data/lib/treat/entities/entity.rb +5 -3
  14. data/lib/treat/entities/tokens.rb +14 -5
  15. data/lib/treat/entities/zones.rb +4 -0
  16. data/lib/treat/entities.rb +7 -5
  17. data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
  18. data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
  19. data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
  20. data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
  21. data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
  22. data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
  23. data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
  24. data/lib/treat/extractors/time/chronic.rb +8 -0
  25. data/lib/treat/extractors/time/native.rb +6 -0
  26. data/lib/treat/extractors/time/nickel.rb +31 -23
  27. data/lib/treat/extractors/topic_words/lda.rb +21 -16
  28. data/lib/treat/extractors/topics/reuters.rb +6 -4
  29. data/lib/treat/extractors.rb +7 -7
  30. data/lib/treat/formatters/readers/abw.rb +32 -0
  31. data/lib/treat/formatters/readers/autoselect.rb +13 -11
  32. data/lib/treat/formatters/readers/doc.rb +13 -0
  33. data/lib/treat/formatters/readers/gocr.rb +2 -0
  34. data/lib/treat/formatters/readers/html.rb +21 -1
  35. data/lib/treat/formatters/readers/ocropus.rb +3 -3
  36. data/lib/treat/formatters/readers/odt.rb +41 -0
  37. data/lib/treat/formatters/readers/pdf.rb +5 -2
  38. data/lib/treat/formatters/readers/txt.rb +2 -0
  39. data/lib/treat/formatters/serializers/xml.rb +3 -2
  40. data/lib/treat/formatters/serializers/yaml.rb +2 -0
  41. data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
  42. data/lib/treat/formatters/unserializers/xml.rb +6 -1
  43. data/lib/treat/formatters/unserializers/yaml.rb +5 -1
  44. data/lib/treat/formatters/visualizers/dot.rb +35 -37
  45. data/lib/treat/formatters/visualizers/html.rb +1 -0
  46. data/lib/treat/formatters/visualizers/inspect.rb +4 -0
  47. data/lib/treat/formatters/visualizers/short_value.rb +18 -3
  48. data/lib/treat/formatters/visualizers/standoff.rb +11 -6
  49. data/lib/treat/formatters/visualizers/tree.rb +5 -1
  50. data/lib/treat/formatters/visualizers/txt.rb +6 -1
  51. data/lib/treat/formatters.rb +1 -1
  52. data/lib/treat/group.rb +4 -3
  53. data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
  54. data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
  55. data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
  56. data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
  57. data/lib/treat/inflectors/stem/porter.rb +6 -2
  58. data/lib/treat/inflectors/stem/porter_c.rb +4 -1
  59. data/lib/treat/inflectors/stem/uea.rb +4 -4
  60. data/lib/treat/languages/english/tags.rb +16 -0
  61. data/lib/treat/languages/english.rb +4 -1
  62. data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
  63. data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
  64. data/lib/treat/lexicalizers/tag/brill.rb +3 -11
  65. data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
  66. data/lib/treat/lexicalizers.rb +0 -2
  67. data/lib/treat/processors/chunkers/txt.rb +4 -4
  68. data/lib/treat/processors/parsers/enju.rb +3 -17
  69. data/lib/treat/processors/parsers/stanford.rb +4 -0
  70. data/lib/treat/processors/segmenters/punkt.rb +1 -0
  71. data/lib/treat/processors/segmenters/stanford.rb +4 -0
  72. data/lib/treat/processors/segmenters/tactful.rb +4 -1
  73. data/lib/treat/processors/tokenizers/punkt.rb +1 -2
  74. data/lib/treat/processors/tokenizers/stanford.rb +4 -0
  75. data/lib/treat/processors/tokenizers/tactful.rb +1 -1
  76. data/lib/treat/processors.rb +4 -4
  77. data/lib/treat/proxies.rb +18 -11
  78. data/lib/treat/registrable.rb +12 -5
  79. data/lib/treat/sugar.rb +8 -3
  80. data/lib/treat/tree.rb +10 -3
  81. data/lib/treat.rb +55 -55
  82. data/test/tc_entity.rb +7 -7
  83. data/test/tc_extractors.rb +6 -4
  84. data/test/tc_formatters.rb +0 -4
  85. data/test/tests.rb +2 -0
  86. data/test/texts.rb +4 -4
  87. metadata +48 -56
  88. data/examples/texts/bugged_out.txt +0 -26
  89. data/examples/texts/half_cocked_basel.txt +0 -16
  90. data/examples/texts/hedge_funds.txt +0 -24
  91. data/examples/texts/hose_and_dry.txt +0 -19
  92. data/examples/texts/hungarys_troubles.txt +0 -46
  93. data/examples/texts/indias_slowdown.txt +0 -15
  94. data/examples/texts/merkozy_rides_again.txt +0 -24
  95. data/examples/texts/prada_is_not_walmart.txt +0 -9
  96. data/examples/texts/republican_nomination.txt +0 -26
  97. data/examples/texts/to_infinity_and_beyond.txt +0 -15
  98. data/lib/treat/entities/text.rb +0 -7
  99. data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
  100. data/lib/treat/formatters/cleaners/html.rb +0 -17
data/LICENSE CHANGED
@@ -18,11 +18,10 @@ Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright 2012.
18
18
  Non-trivial amount of code has been incorporated and modified from
19
19
  other libraries, specifically for the following files:
20
20
 
21
- - processors/tokenizers/macintyre.rb - Utiyama Masao (Ruby License)
22
- - processors/tokenizers/tactful.rb - Matthew Bunday (GPL license)
23
- - Inflectors/lemmatizers/e_lemma.rb - Utiyama Masao (GPL license)
24
- - processors/tokenizers/perl.rb - Todd A. Fisher (MIT License)
25
- - processors/tokenizers/punkt.rb - Steven Bird Edward Loper and Joel Nothman (Apache 2.0 license),
26
- - extractors/topics/reuters.rb - Mark Watson (GPL license)
27
- - inflectors/stemmers/porter.rb - Ray Pereda (No license information)
28
- - tree.rb - Partyl based on work by
21
+ - processors/tokenizers/macintyre.rb - Utiyama Masao (Ruby License)
22
+ - processors/tokenizers/tactful.rb - Matthew Bunday (GPL license)
23
+ - processors/tokenizers/perl.rb - Todd A. Fisher (MIT License)
24
+ - processors/tokenizers/punkt.rb - Steven Bird Edward Loper and Joel Nothman (Apache 2.0 license)
25
+ - extractors/topics/reuters.rb - Mark Watson (GPL license)
26
+ - inflectors/stemmers/porter.rb - Ray Pereda (No license information)
27
+ - tree.rb - Partly based on work by Anupam Sengupta (Creative Commons Attribution-ShareAlike Unported v. 3.0)
data/TODO CHANGED
@@ -1,25 +1,26 @@
1
1
  ## Urgent
2
2
 
3
- - Linkers
3
+ - Linkers + documentation
4
4
  - Check taggers for context
5
5
  - Stanford dependencies parse
6
- - Enju: test
7
6
  - Ocropus => use better function
8
7
  - Optimize magic methods... is_token? type methods, phrase categories.
9
- - Move statistics?
8
+ - Move statistics
10
9
  - Synset class move
11
10
  - general procedure for options, check that user doesn't want to change options...
12
- - Languages: dependencies vs. edges, PTB function tags
13
- - Check for # Fix everywhere
14
- - Check paths; parse bin paths
15
- - Ferret, Natural Inputs
16
- - Use consistently delegate
17
- - Text becomes section
18
- - Remove top level
11
+ - Ferret, Spider
19
12
  - Loading multiple JARs
13
+ - Linguistics loader, stanford loader
20
14
  - Tokenized sentences are not parsed
21
- - Documentation
22
- - Remove feature
15
+ - Dot colors
16
+ - Fix encoders
17
+ - Fix Punkt segmenter training text.
18
+ - Mark Watson's text extractor
19
+ - Statistics position in
20
+ - Fix documentation antiword, Graphviz, # encoding: utf-8
21
+ - Shortcut methods.. pre/postprocessors
22
+ - Only Phrase..
23
+ - Frequency in
23
24
 
24
25
  ## Eventually
25
26
 
@@ -52,4 +53,6 @@
52
53
  - String type detector for other languages
53
54
  - Automatic benchmark
54
55
  - Raspell spell checker
55
- - Multithreading
56
+ - Multithreading
57
+ - Mark Watson's Java NLP utility to identify proper nouns (human names and places) in text
58
+ - FastTag a Java fast part of speech tagger.
data/examples/keywords.rb CHANGED
@@ -57,4 +57,92 @@ c.each_document do |d|
57
57
  end
58
58
  end
59
59
 
60
- end
60
+ end
61
+
62
+
63
+
64
+ Treat.edulcorate
65
+ Treat.bin = '/ruby/nat/bin'
66
+
67
+ c = Collection 'economist'
68
+ c.each_document { |doc| doc.chunk.segment.tokenize }
69
+
70
+ topic_words = c.topic_words(
71
+ :lda,
72
+ :topics => 5,
73
+ :words_per_topic => 5,
74
+ :iterations => 20
75
+ )
76
+
77
+ keywords = c.keywords(
78
+ :topics_frequency,
79
+ :topic_words => topic_words,
80
+ :tf_idf_threshold => 180
81
+ )
82
+
83
+ puts keywords.inspect
84
+
85
+ abort
86
+
87
+ c = Phrase 'a test clause'
88
+ c.parse
89
+ puts c.visualize(:tree)
90
+ puts c.visualize(:inspect)
91
+ puts c.visualize(:short_value)
92
+ puts c.visualize(:standoff)
93
+ puts c.visualize(:tree)
94
+
95
+ c.serialize(:yaml).save('test.yml')
96
+ c.serialize(:xml).save('test.xml')
97
+
98
+ d = Phrase 'test.yml'
99
+ d.print_tree
100
+ d = Phrase 'test.xml'
101
+ d.print_tree
102
+
103
+ puts d.words[0].position_in_parent
104
+ abort
105
+
106
+ w = Word 'running'
107
+ puts w.stem(:porter_c)
108
+ puts w.stem(:porter)
109
+ puts w.stem(:uea)
110
+
111
+ w = Word 'run'
112
+
113
+ puts w.infinitive(:linguistics)
114
+ puts w.present_participle(:linguistics)
115
+ puts w.plural(:linguistics)
116
+
117
+ w = Word 'table'
118
+
119
+ puts w.synonyms.inspect
120
+ puts w.antonyms.inspect
121
+ puts w.hyponyms.inspect
122
+ puts w.hypernyms.inspect
123
+
124
+ n = Number 2
125
+ puts n.ordinal_words(:linguistics)
126
+ puts n.cardinal_words(:linguistics)
127
+
128
+ s = Sentence 'A sentence to parse.'
129
+ s.dup.parse(:enju).print_tree
130
+ s.dup.parse(:stanford).print_tree
131
+
132
+ s = Sentence 'A sentence to tokenize'
133
+ s.dup.tokenize(:macintyre).print_tree
134
+ s.dup.tokenize(:multilingual).print_tree
135
+ s.dup.tokenize(:perl).print_tree
136
+ s.dup.tokenize(:punkt).print_tree
137
+ s.dup.tokenize(:stanford).print_tree
138
+ s.dup.tokenize(:tactful).print_tree
139
+
140
+
141
+ =begin
142
+ c = Collection 'economist'
143
+ # c.each_document { |d| d.chunk.segment.tokenize }
144
+ c.documents[0].chunk.segment
145
+ c.sentences[0].parse(:enju)
146
+ c.each_word { |word| word.stem }
147
+ c.visualize(:dot, features: [:tag]).save('test.dot')
148
+ =end
@@ -24,9 +24,8 @@ module Treat
24
24
  "Cannot create a document or collection from " +
25
25
  "a string (need a readable file/folder)."
26
26
  end
27
- string = string.to_s
28
27
  dot = string.count('.') + string.count('!') + string.count('?')
29
- return Treat::Entities::Text.new(string) if dot > 1 ||
28
+ return Treat::Entities::Section.new(string) if dot > 1 ||
30
29
  (string.count("\n") > 0 && dot == 1)
31
30
  return Treat::Entities::Sentence.new(string) if dot == 1 && string.size > 5
32
31
  if string.count(' ') == 0
@@ -99,12 +98,6 @@ module Treat
99
98
  d.read
100
99
  end
101
100
  def from_serialized_file(file)
102
- unless [Treat::Entities::Document,
103
- Treat::Entities::Collection].include?(self)
104
- raise Treat::Exception,
105
- "Cannot create something else than a " +
106
- "document from raw file '#{file}'."
107
- end
108
101
  d = Treat::Entities::Document.new(file)
109
102
  d.unserialize
110
103
  d.children[0].set_as_root!
@@ -1,17 +1,16 @@
1
1
  module Treat
2
2
  # This module keeps track of all categories that
3
- # exist and the methods they implement, and is
4
- # responsible for including the categories.
3
+ # exist and the methods they implement.
5
4
  module Categories
6
- # A list of categories.
7
5
  class << self; attr_accessor :list; end
6
+ # Array - list of all categories.
8
7
  self.list = []
9
8
  # Boolean - does any of the categories have
10
9
  # a method that corresponds to sym?
11
10
  def self.have_method?(sym); methods.include?(sym); end
12
11
  # Cache the list of methods once it has been computed.
13
12
  @@methods = []
14
- # Provide a list of all methods implemented
13
+ # Array - provide a list of all methods implemented
15
14
  # by all Treat categories.
16
15
  def self.methods
17
16
  return @@methods unless @@methods.empty?
@@ -12,7 +12,7 @@ module Treat
12
12
  groups.each do |group|
13
13
  group = const_get(group)
14
14
  group.targets.each do |entity_type|
15
- entity = Entities.const_get(cc(entity_type))
15
+ entity = Treat::Entities.const_get(cc(entity_type))
16
16
  entity.class_eval { add_delegators group }
17
17
  end
18
18
  end
@@ -46,7 +46,7 @@ module Treat
46
46
  delegate_klass = group.const_get(:"#{cc(delegate.to_s)}")
47
47
  result = entity.accept(group, delegate_klass, m, options)
48
48
  if decorator
49
- result = group.send(decorator, self, result)
49
+ result = group.send(decorator, entity, result)
50
50
  end
51
51
  if group.type == :annotator
52
52
  f = decorator.nil? ? m : decorator
@@ -1,7 +1,12 @@
1
1
  module Treat
2
2
  module Detectors
3
3
  module Encoding
4
+ # A wrapper class for Ruby's native encoding detector.
4
5
  class Native
6
+ # Return the encoding of the entity according
7
+ # to the Ruby interpreter.
8
+ #
9
+ # Options: none.
5
10
  def self.encoding(entity, options={})
6
11
  entity.value.encoding.name.
7
12
  gsub('-', '_').downcase.intern
@@ -6,9 +6,8 @@ module Treat
6
6
  # A wrapper for the 'rchardet19' gem, which
7
7
  # detects the encoding of a file.
8
8
  class RChardet19
9
- # Returns an Encoding object representing
10
- # the encoding of the supplied entity's
11
- # text value.
9
+ # Returns the encoding of the entity according
10
+ # to the 'rchardet19' gem.
12
11
  #
13
12
  # Options: none.
14
13
  def self.encoding(entity, options={})
@@ -1,6 +1,10 @@
1
1
  module Treat
2
2
  module Detectors
3
3
  module Language
4
+ # A generic language detector, which is called before
5
+ # any language detector and ensures that configuration
6
+ # options concerning language are enforced (e.g. returns
7
+ # the default language when Treat.detect_language is false).
4
8
  class LanguageDetector
5
9
  def self.language(entity, options = {})
6
10
  if Treat.detect_language == false
@@ -7,7 +7,7 @@ module Treat
7
7
  # performs probabilistic language detection.
8
8
  class WhatLanguage < LanguageDetector
9
9
  # Keep only one instance of the gem class.
10
- @@wl = nil
10
+ @@detector = nil
11
11
  # Detect the language of an entity using the
12
12
  # 'whatlanguage' gem. Return an identifier
13
13
  # corresponding to the ISO-639-2 code for the
@@ -15,10 +15,10 @@ module Treat
15
15
  def self.language(entity, options = {})
16
16
  predetection = super(entity, options)
17
17
  return predetection if predetection
18
- @@wl ||= ::WhatLanguage.new(:all)
19
- all = @@wl.process_text(entity.to_s)
18
+ @@detector ||= ::WhatLanguage.new(:possibilities)
19
+ possibilities = @@detector.process_text(entity.to_s)
20
20
  lang = {}
21
- all.each do |k,v|
21
+ possibilities.each do |k,v|
22
22
  lang[Treat::Languages.find(k)] = v
23
23
  end
24
24
  Treat::Feature.new(lang).best
@@ -2,7 +2,7 @@ module Treat
2
2
  # Detectors detect a specific meta-information about
3
3
  # an entity, such as encoding, format and language.
4
4
  #
5
- # Detectors are language-independent, and thus they
5
+ # Detectors are language-independent, and thus there
6
6
  # are default algorithms specified for each of them.
7
7
  module Detectors
8
8
  # Group for algorithms that detect encoding.
@@ -43,7 +43,7 @@ module Treat
43
43
  # feature does not exist
44
44
  def method_missing(sym, *args, &block)
45
45
  return self.build(*args) if sym == nil
46
- if !@features[sym]
46
+ if !@features.has_key?(sym)
47
47
  r = parse_magic_method(sym, *args, &block)
48
48
  if r == :no_magic
49
49
  begin
@@ -168,7 +168,10 @@ module Treat
168
168
  def <<(entities, clear_parent = true)
169
169
  entities = [entities] unless entities.is_a? Array
170
170
  entities.each do |entity|
171
- register_token(entity) if entity.is_leaf?
171
+ if entity.is_a?(Treat::Entities::Token) ||
172
+ entity.is_a?(Treat::Entities::Constituent)
173
+ register_token(entity) unless entity.value == ''
174
+ end
172
175
  end
173
176
  super(entities)
174
177
  @parent.value = '' if has_parent?
@@ -211,7 +214,6 @@ module Treat
211
214
  def short_value(ml = 6); visualize(:short_value, :max_length => ml); end
212
215
  # Convenience functions. Convenience decorators.
213
216
  def frequency_of(word); statistics(:frequency_of, value: word); end
214
-
215
217
  private
216
218
  # Return the first element in the array, warning if not
217
219
  # the only one in the array. Used for magic methods: e.g.,
@@ -4,15 +4,24 @@ module Treat
4
4
  class Token < Entity
5
5
  # All tokens are leafs.
6
6
  def is_leaf?; true; end
7
- def frequency; self.set :frequency, statistics(:frequency); end
7
+ # Convenience function for statistics.
8
+ def frequency; statistics(:frequency_in); end
9
+ def frequency_in(type); statistics(:frequency_in, type: type); end
10
+ def position_in(type); statistics(:position_in_parent); end
11
+ def tf_idf; statistics(:tf_idf); end
8
12
  end
9
13
  # Represents a word.
10
14
  class Word < Token
11
- def infinitive(conjugator = nil); conjugate(conjugator, :mode => :infinitive); end
12
- def present_participle(conjugator = nil); conjugate(conjugator, :tense => :present, :mode => :participle); end
13
- def plural(declensor = nil); declense(declensor, :count => :plural); end
14
- def singular(declensor = nil); declense(declensor, :count => :singular); end
15
+ # Convenience function for conjugations.
16
+ def infinitive(conjugator = nil); conjugations(conjugator, :mode => :infinitive); end
17
+ # Convenience function for conjugations.
18
+ def present_participle(conjugator = nil); conjugations(conjugator, :tense => :present, :mode => :participle); end
19
+ # Convenience function for declensions.
20
+ def plural(declensor = nil); declensions(declensor, :count => :plural); end
21
+ # Convenience function for declensions.
22
+ def singular(declensor = nil); declensions(declensor, :count => :singular); end
15
23
  end
24
+ # Represents a clitic ('s).
16
25
  class Clitic < Token
17
26
  end
18
27
  # Represents a number.
@@ -13,5 +13,9 @@ module Treat
13
13
  # Represents a list.
14
14
  class List < Zone
15
15
  end
16
+ # Represents a section, usually with a title
17
+ # and at least one paragraph.
18
+ class Section < Zone
19
+ end
16
20
  end
17
21
  end
@@ -14,7 +14,6 @@ module Treat
14
14
  # Then require all possible entities.
15
15
  require 'treat/entities/collection'
16
16
  require 'treat/entities/document'
17
- require 'treat/entities/text'
18
17
  require 'treat/entities/zones'
19
18
  require 'treat/entities/sentence'
20
19
  require 'treat/entities/constituents'
@@ -25,9 +24,11 @@ module Treat
25
24
  const_get(entity).build(value, id)
26
25
  end
27
26
  end
27
+ # Cache a list of defined entity types to
28
+ # improve performance.
29
+ @@list = []
28
30
  # Provide a list of defined entity types,
29
31
  # as non-camel case identifiers.
30
- @@list = []
31
32
  def self.list
32
33
  return @@list unless @@list.empty?
33
34
  self.constants.each do |constant|
@@ -35,16 +36,17 @@ module Treat
35
36
  end
36
37
  @@list
37
38
  end
38
- # Return the 'z-order' for hierarchical
39
- # comparison of entity types.
39
+ # Return the hierarchy level of the entity
40
+ # class, the minimum being a Token and the
41
+ # maximum being a Collection.
40
42
  def self.rank(type)
41
43
  klass = Entities.const_get(cc(type))
42
44
  compare = lambda { |a,b| a == b || a < b }
43
45
  return 0 if compare.call(klass, Token)
44
46
  return 1 if compare.call(klass, Constituent)
45
47
  return 2 if compare.call(klass, Sentence)
48
+ return 3 if compare.call(klass, Zone)
46
49
  return 4 if compare.call(klass, Document)
47
- return 3 if compare.call(klass, Section)
48
50
  return 5 if compare.call(klass, Collection)
49
51
  end
50
52
  end
@@ -0,0 +1,40 @@
1
+ module Treat
2
+ module Extractors
3
+ module Keywords
4
+ class TopicsFrequency
5
+ DefaultOptions = {tf_idf_threshold: 180, topic_words: nil}
6
+ def self.keywords(entity, options = {})
7
+ options = DefaultOptions.merge(options)
8
+ unless options[:topic_words]
9
+ raise Treat::Exception, "You must supply topic words."
10
+ end
11
+ if Treat::Entities.rank(entity.type) <
12
+ Treat::Entities.rank(:sentence)
13
+ raise Treat::Exception, 'Cannot get the key ' +
14
+ 'sentences of an entity smaller than a sentence.'
15
+ else
16
+ find_keywords(entity, options)
17
+ end
18
+ end
19
+ def self.find_keywords(entity, options)
20
+ keywords = []
21
+ entity.each_word do |word|
22
+ found = false
23
+ options[:topic_words].each do |i, topic_words|
24
+ next if keywords.include?(word.value)
25
+ if topic_words.include?(word.value)
26
+ found = true
27
+ tf_idf = word.tf_idf
28
+ if tf_idf < options[:tf_idf_threshold]
29
+ keywords << word.value
30
+ word.set :is_keyword?, found
31
+ end
32
+ end
33
+ end
34
+ end
35
+ keywords
36
+ end
37
+ end
38
+ end
39
+ end
40
+ end
@@ -1,15 +1,16 @@
1
1
  module Treat
2
2
  module Extractors
3
3
  module Statistics
4
- class Frequency
5
- # Find the frequency of a given string value.
4
+ class FrequencyIn
5
+ DefaultOptions = {type: nil}
6
6
  def self.statistics(entity, options={})
7
+ options = DefaultOptions.merge(options)
7
8
  if entity.is_leaf?
8
9
  w = entity.value.downcase
9
- if entity.token_registry[:value][w].nil?
10
+ if entity.token_registry(options[:type])[:value][w].nil?
10
11
  0
11
12
  else
12
- entity.token_registry[:value][w].size
13
+ entity.token_registry(options[:type])[:value][w].size
13
14
  end
14
15
  else
15
16
  raise Treat::Exception,
@@ -5,11 +5,9 @@ module Treat
5
5
  # Find the frequency of a given string value.
6
6
  def self.statistics(entity, options = {})
7
7
  w = options[:value]
8
- if entity.token_registry[:value][w].nil?
9
- 0
10
- else
11
- entity.token_registry[:value][w].size
12
- end
8
+ raise Treat::Exception, "Must supply a non-nil value." unless w
9
+ entity.token_registry[:value][w].nil? ? 0 :
10
+ entity.token_registry[:value][w].size
13
11
  end
14
12
  end
15
13
  end
@@ -1,11 +1,12 @@
1
1
  module Treat
2
2
  module Extractors
3
3
  module Statistics
4
- class PositionIn
4
+ class PositionInParent
5
5
  # Find the position of the current entity
6
6
  # inside the parent entity with type entity_type.
7
- def self.statistics(entity)
8
- raise Treat::Exception, 'Could you implement this?'
7
+ # Returns the index of the entity among its parent's children.
8
+ def self.statistics(entity, options = {})
9
+ entity.parent.children.index(entity)
9
10
  end
10
11
  end
11
12
  end
@@ -0,0 +1,36 @@
1
+ module Treat
2
+ module Extractors
3
+ module Statistics
4
+ # "The term count in the given document is simply the
5
+ # number of times a given term appears in that document.
6
+ # This count is usually normalized to prevent a bias
7
+ # towards longer documents (which may have a higher
8
+ # term count regardless of the actual importance of
9
+ # that term in the document) to give a measure of the
10
+ # importance of the term t within the particular document d.
11
+ # Thus we have the term frequency tf(t,d), defined in the
12
+ # simplest case as the occurrence count of a term in a document.
13
+ #
14
+ # The inverse document frequency is a measure of the general
15
+ # importance of the term (obtained by dividing the total number
16
+ # of documents by the number of documents containing the term,
17
+ # and then taking the logarithm of that quotient)."
18
+ #
19
+ # (From Wikipedia)
20
+ class TfIdf
21
+ DefaultOptions = { type: nil }
22
+ def self.statistics(entity, options={})
23
+ tf = entity.frequency_in(:document)
24
+ tf = tf / entity.root.word_count
25
+ d = entity.root.document_count
26
+ i = 0
27
+ entity.root.each_document do |document|
28
+ i += 1 if document.frequency_of(entity.value)
29
+ end
30
+ idf = ::Math.log(d.to_f/(i.to_f + 1)).abs
31
+ tf.to_f/idf.to_f
32
+ end
33
+ end
34
+ end
35
+ end
36
+ end
@@ -1,23 +1,23 @@
1
1
  module Treat
2
2
  module Extractors
3
3
  module Statistics
4
+ # Experimental algorithm to generate transition matrices.
4
5
  class TransitionMatrix
5
-
6
+ DefaultOptions = {
7
+ normalize: true,
8
+ features: [:tag],
9
+ condition: lambda { |e| true },
10
+ entity_types: [:word],
11
+ relationships: [:parent, :right, :children]
12
+ }
6
13
  # Find the transition matrix.
7
14
  def self.statistics(entity, options={})
8
-
9
- normalize = options[:normalize] || true
10
- features = options[:features] || [:tag]
11
- condition = options[:condition] || lambda { |e| true }
12
- entity_types = options[:entity_types] ? options[:entity_types] :
13
- [options[:entity_type]]
14
- relationships = options[:relationships] ||
15
- [:parent, :left, :right, :children]
15
+ options = DefaultOptions.merge(options)
16
16
 
17
17
  # Create lambdas to generate the arrays.
18
- empty_prototype = {}; features.each { |f| empty_prototype[f] = {} }
18
+ empty_prototype = {}; options[:features].each { |f| empty_prototype[f] = {} }
19
19
  empty = lambda { Marshal.load(Marshal.dump(empty_prototype)) }
20
- empty2_prototype = {}; relationships.each { |r| empty2_prototype[r] = empty.call }
20
+ empty2_prototype = {}; options[:relationships].each { |r| empty2_prototype[r] = empty.call }
21
21
  empty2 = lambda { Marshal.load(Marshal.dump(empty2_prototype)) }
22
22
 
23
23
  # Deep (recursive) merger.
@@ -27,24 +27,25 @@ module Treat
27
27
 
28
28
  # Master matrix.
29
29
  mm = nil
30
+ tm = empty.call
30
31
 
31
- entity.each_entity(*entity_types) do |target|
32
-
33
- next unless condition.call(target)
32
+ entity.each_entity(*options[:entity_types]) do |target|
33
+
34
+ next unless options[:condition].call(target)
34
35
 
35
36
  # Initialize the empty transition matrix.
36
- tm = empty.call
37
+
37
38
 
38
39
  # Calculate the transition probabilities.
39
- features.each do |f1|
40
+ options[:features].each do |f1|
40
41
 
41
42
  v1 = target.send(f1)
42
43
  tm[f1][v1] = empty2.call
43
44
 
44
- relationships.each do |relationship|
45
+ options[:relationships].each do |relationship|
45
46
  tm[f1][v1][relationship] = empty.call
46
-
47
- features.each do |f2|
47
+
48
+ options[:features].each do |f2|
48
49
  relatives = target.send(relationship)
49
50
  relatives = [relatives] unless relatives.is_a? Array
50
51
  relatives.each do |relative|
@@ -55,9 +56,9 @@ module Treat
55
56
  tm[f1][v1][relationship][f2][v2] += 1.0
56
57
  end
57
58
  end
58
-
59
+
59
60
  tm[f1][v1][:edge] = empty.call
60
-
61
+
61
62
  target.edges.each do |id, edge_type|
62
63
  s = target.ancestor_with_type :sentence
63
64
  if s
@@ -68,14 +69,13 @@ module Treat
68
69
  tm[f1][v1][:edge][f2][v2] += 1.0
69
70
  end
70
71
  end
71
-
72
+
72
73
  end
73
74
  end
74
75
  end
75
-
76
- mm = mm ? mm.merge(tm, &merger) : tm
77
76
  end
78
- if normalize
77
+ mm = mm ? mm.merge(tm, &merger) : tm
78
+ if options[:normalize]
79
79
  normalize(mm)
80
80
  else
81
81
  mm