treat 0.1.4 → 0.2.0

Files changed (160):
  1. data/LICENSE +4 -4
  2. data/TODO +21 -54
  3. data/lib/economist/half_cocked_basel.txt +16 -0
  4. data/lib/economist/hose_and_dry.doc +0 -0
  5. data/lib/economist/hungarys_troubles.abw +70 -0
  6. data/lib/economist/republican_nomination.pdf +0 -0
  7. data/lib/economist/saving_the_euro.odt +0 -0
  8. data/lib/economist/to_infinity_and_beyond.txt +15 -0
  9. data/lib/economist/zero_sum.html +91 -0
  10. data/lib/treat.rb +58 -72
  11. data/lib/treat/buildable.rb +59 -15
  12. data/lib/treat/categories.rb +26 -14
  13. data/lib/treat/category.rb +2 -2
  14. data/lib/treat/delegatable.rb +65 -48
  15. data/lib/treat/doable.rb +44 -0
  16. data/lib/treat/entities.rb +34 -14
  17. data/lib/treat/entities/collection.rb +2 -0
  18. data/lib/treat/entities/document.rb +3 -2
  19. data/lib/treat/entities/entity.rb +105 -90
  20. data/lib/treat/entities/phrases.rb +17 -0
  21. data/lib/treat/entities/tokens.rb +28 -13
  22. data/lib/treat/entities/zones.rb +20 -0
  23. data/lib/treat/extractors.rb +49 -11
  24. data/lib/treat/extractors/coreferences/stanford.rb +68 -0
  25. data/lib/treat/extractors/date/chronic.rb +32 -0
  26. data/lib/treat/extractors/date/ruby.rb +25 -0
  27. data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
  28. data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
  29. data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
  30. data/lib/treat/extractors/language/what_language.rb +49 -0
  31. data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
  32. data/lib/treat/extractors/roles/naive.rb +73 -0
  33. data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
  34. data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
  35. data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
  36. data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
  37. data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
  38. data/lib/treat/extractors/time/nickel.rb +30 -12
  39. data/lib/treat/extractors/topic_words/lda.rb +9 -9
  40. data/lib/treat/extractors/topics/reuters.rb +14 -15
  41. data/lib/treat/extractors/topics/reuters/region.xml +1 -0
  42. data/lib/treat/features.rb +7 -0
  43. data/lib/treat/formatters/readers/abw.rb +6 -1
  44. data/lib/treat/formatters/readers/autoselect.rb +5 -6
  45. data/lib/treat/formatters/readers/doc.rb +3 -1
  46. data/lib/treat/formatters/readers/html.rb +1 -1
  47. data/lib/treat/formatters/readers/image.rb +43 -0
  48. data/lib/treat/formatters/readers/odt.rb +1 -2
  49. data/lib/treat/formatters/readers/pdf.rb +9 -1
  50. data/lib/treat/formatters/readers/xml.rb +40 -0
  51. data/lib/treat/formatters/serializers/xml.rb +50 -14
  52. data/lib/treat/formatters/serializers/yaml.rb +7 -2
  53. data/lib/treat/formatters/unserializers/xml.rb +33 -7
  54. data/lib/treat/formatters/visualizers/dot.rb +90 -20
  55. data/lib/treat/formatters/visualizers/short_value.rb +2 -2
  56. data/lib/treat/formatters/visualizers/standoff.rb +2 -2
  57. data/lib/treat/formatters/visualizers/tree.rb +1 -1
  58. data/lib/treat/formatters/visualizers/txt.rb +13 -4
  59. data/lib/treat/group.rb +16 -10
  60. data/lib/treat/helpers/linguistics_loader.rb +18 -0
  61. data/lib/treat/inflectors.rb +10 -0
  62. data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
  63. data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
  64. data/lib/treat/inflectors/declensions/english.rb +319 -0
  65. data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
  66. data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
  67. data/lib/treat/install.rb +59 -0
  68. data/lib/treat/kernel.rb +18 -8
  69. data/lib/treat/languages.rb +18 -11
  70. data/lib/treat/languages/arabic.rb +4 -2
  71. data/lib/treat/languages/chinese.rb +6 -2
  72. data/lib/treat/languages/dutch.rb +16 -0
  73. data/lib/treat/languages/english.rb +47 -19
  74. data/lib/treat/languages/french.rb +8 -5
  75. data/lib/treat/languages/german.rb +9 -6
  76. data/lib/treat/languages/greek.rb +16 -0
  77. data/lib/treat/languages/italian.rb +6 -3
  78. data/lib/treat/languages/polish.rb +16 -0
  79. data/lib/treat/languages/portuguese.rb +16 -0
  80. data/lib/treat/languages/russian.rb +16 -0
  81. data/lib/treat/languages/spanish.rb +16 -0
  82. data/lib/treat/languages/swedish.rb +16 -0
  83. data/lib/treat/languages/tags.rb +377 -0
  84. data/lib/treat/lexicalizers.rb +34 -23
  85. data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
  86. data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
  87. data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
  88. data/lib/treat/lexicalizers/tag/brill.rb +35 -40
  89. data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
  90. data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
  91. data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
  92. data/lib/treat/processors.rb +8 -8
  93. data/lib/treat/processors/chunkers/txt.rb +4 -4
  94. data/lib/treat/processors/parsers/enju.rb +114 -99
  95. data/lib/treat/processors/parsers/stanford.rb +109 -41
  96. data/lib/treat/processors/segmenters/punkt.rb +17 -18
  97. data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
  98. data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
  99. data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
  100. data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
  101. data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
  102. data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
  103. data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
  104. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
  105. data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
  106. data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
  107. data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
  108. data/lib/treat/processors/segmenters/stanford.rb +38 -37
  109. data/lib/treat/processors/segmenters/tactful.rb +5 -4
  110. data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
  111. data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
  112. data/lib/treat/processors/tokenizers/perl.rb +2 -2
  113. data/lib/treat/processors/tokenizers/punkt.rb +6 -2
  114. data/lib/treat/processors/tokenizers/stanford.rb +25 -24
  115. data/lib/treat/processors/tokenizers/tactful.rb +1 -2
  116. data/lib/treat/proxies.rb +2 -35
  117. data/lib/treat/registrable.rb +17 -22
  118. data/lib/treat/sugar.rb +11 -11
  119. data/lib/treat/tree.rb +27 -17
  120. data/lib/treat/viewable.rb +29 -0
  121. data/lib/treat/visitable.rb +1 -1
  122. data/test/tc_entity.rb +56 -49
  123. data/test/tc_extractors.rb +41 -18
  124. data/test/tc_formatters.rb +7 -8
  125. data/test/tc_inflectors.rb +19 -24
  126. data/test/tc_lexicalizers.rb +12 -19
  127. data/test/tc_processors.rb +26 -12
  128. data/test/tc_resources.rb +2 -7
  129. data/test/tc_treat.rb +20 -22
  130. data/test/tc_tree.rb +4 -4
  131. data/test/tests.rb +3 -5
  132. data/test/texts.rb +13 -14
  133. data/tmp/INFO +1 -0
  134. metadata +78 -158
  135. data/bin/INFO +0 -1
  136. data/examples/benchmark.rb +0 -81
  137. data/examples/keywords.rb +0 -148
  138. data/lib/treat/detectors.rb +0 -31
  139. data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
  140. data/lib/treat/detectors/format/file.rb +0 -36
  141. data/lib/treat/detectors/language/what_language.rb +0 -29
  142. data/lib/treat/entities/constituents.rb +0 -15
  143. data/lib/treat/entities/sentence.rb +0 -8
  144. data/lib/treat/extractors/named_entity/abner.rb +0 -20
  145. data/lib/treat/extractors/named_entity/stanford.rb +0 -174
  146. data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
  147. data/lib/treat/extractors/time/chronic.rb +0 -20
  148. data/lib/treat/extractors/time/native.rb +0 -18
  149. data/lib/treat/formatters/readers/gocr.rb +0 -26
  150. data/lib/treat/formatters/readers/ocropus.rb +0 -31
  151. data/lib/treat/formatters/visualizers/html.rb +0 -13
  152. data/lib/treat/formatters/visualizers/inspect.rb +0 -20
  153. data/lib/treat/inflectors/declensions/en.rb +0 -18
  154. data/lib/treat/languages/categories.rb +0 -5
  155. data/lib/treat/languages/english/categories.rb +0 -23
  156. data/lib/treat/languages/english/tags.rb +0 -352
  157. data/lib/treat/languages/xinhua.rb +0 -12
  158. data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
  159. data/lib/treat/string.rb +0 -5
  160. data/test/tc_detectors.rb +0 -26
@@ -2,20 +2,13 @@ module Treat
   module Extractors
     module Statistics
       class FrequencyIn
-        DefaultOptions = {type: nil}
-        def self.statistics(entity, options={})
+        DefaultOptions = { :parent => nil }
+        # Find the frequency of a given string value.
+        def self.statistics(entity, options = {})
           options = DefaultOptions.merge(options)
-          if entity.is_leaf?
-            w = entity.value.downcase
-            if entity.token_registry(options[:type])[:value][w].nil?
-              0
-            else
-              entity.token_registry(options[:type])[:value][w].size
-            end
-          else
-            raise Treat::Exception,
-              'Cannot get the frequency of a non-terminal entity.'
-          end
+          tr = entity.token_registry(options[:parent])
+          tv = tr[:value][entity.value]
+          tv ? tv.size : 1
         end
       end
     end
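
The reworked FrequencyIn just sizes the token registry bucket for the entity's value. A minimal sketch of the new call shape (the `word` entity and its parent document are assumed to come from an already-built Treat tree):

    # Count occurrences of this word's value within its parent document;
    # :parent selects whose token registry to consult.
    word = document.words.first  # hypothetical entity from a parsed document
    Treat::Extractors::Statistics::FrequencyIn.statistics(
      word, :parent => :document
    )
    # => e.g. 3 (falls back to 1 when the registry has no bucket for the value)
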
@@ -6,7 +6,7 @@ module Treat
       # inside the parent entity with type entity_type.
       # Not implemented.
       def self.statistics(entity, options = {})
-        entity.parent.children.index(entity)
+        entity.parent.children.index(entity) ## Fix - ancestor_w_type
       end
     end
   end
@@ -1,34 +1,102 @@
 module Treat
   module Extractors
     module Statistics
-      # "The term count in the given document is simply the
-      # number of times a given term appears in that document.
-      # This count is usually normalized to prevent a bias
-      # towards longer documents (which may have a higher
-      # term count regardless of the actual importance of
-      # that term in the document) to give a measure of the
-      # importance of the term t within the particular document d.
-      # Thus we have the term frequency tf(t,d), defined in the
+      # "The term count in the given document is simply the
+      # number of times a given term appears in that document.
+      # This count is usually normalized to prevent a bias
+      # towards longer documents (which may have a higher
+      # term count regardless of the actual importance of
+      # that term in the document) to give a measure of the
+      # importance of the term t within the particular document d.
+      # Thus we have the term frequency tf(t,d), defined in the
       # simplest case as the occurrence count of a term in a document.
-      #
-      # The inverse document frequency is a measure of the general
-      # importance of the term (obtained by dividing the total number
-      # of documents by the number of documents containing the term,
+      #
+      # The inverse document frequency is a measure of the general
+      # importance of the term (obtained by dividing the total number
+      # of documents by the number of documents containing the term,
       # and then taking the logarithm of that quotient)."
       #
       # (From Wikipedia)
       class TfIdf
-        DefaultOptions = { type: nil }
+        DefaultOptions = {
+          :tf => :natural,
+          :idf => :logarithm,
+          :remove_common_words => true,
+          :precision => 4
+        }
+        Algorithms = {
+          :tf => {
+            :natural => lambda { |tf| tf },
+            :logarithm => lambda { |tf| Math.log(1 + tf) },
+            :sqrt => lambda { |tf| Math.sqrt(tf) }
+          },
+          :idf => {
+            :logarithm => lambda { |n,df| Math.log(n/(1 + df)) },
+            :none => lambda { |n,idf| 1 }
+          }
+        }
+        # Optimization caches for tf*idf.
+        @@n = {}  # Number of documents in the collection (n).
+        @@df = {} # Number of documents that have a given value (document count).
+        @@f = {}  # Number of times a word appears in a given document (term count).
+        @@wc = {} # Number of words in a given document (word count).
+        @@cw = {} # Common words to filter out.
         def self.statistics(entity, options={})
-          tf = entity.frequency_in(:document)
-          tf = tf / entity.root.word_count
-          d = entity.root.document_count
-          i = 0
-          entity.root.each_document do |document|
-            i += 1 if document.frequency_of(entity.value)
+          l = Treat::Languages.get(entity.language)
+          if l.const_defined?(:CommonWords)
+            @@cw[entity.language] = l.const_get(:CommonWords)
+            return 0 if @@cw[entity.language].include?(entity.value)
+          end
+          return 0 if entity.value.length <= 2
+          options = DefaultOptions.merge(options)
+          lambdas = options.partition do |k,v|
+            [:tf, :idf, :normalization].include?(k)
+          end[0]
+          lambdas.each do |opt,val|
+            if opt.is_a?(Symbol)
+              if Algorithms[opt][val]
+                options[opt] = Algorithms[opt][val]
+              else
+                raise Treat::Exception,
+                "The specified algorithm '#{val}' " +
+                "to calculate #{opt} does not exist."
+              end
+            end
+          end
+          collection = entity.parent_collection
+          document = entity.parent_document
+          dc = collection.document_count
+          if !collection || !document
+            raise Treat::Exception,
+            "Tf*Idf requires a collection with documents."
+          end
+          val = entity.value.downcase
+          @@n[collection.id] = dc if @@n[collection.id].nil?
+          @@df[collection.id] ||= {}
+          if @@df[collection.id][val].nil?
+            df = 0
+            collection.each_document do |doc|
+              @@f[doc.id] ||= {}
+              if @@f[doc.id][val].nil?
+                @@f[doc.id][val] =
+                doc.token_registry[:value][val] ?
+                doc.token_registry[:value][val].size : 0
+              end
+              df += 1 if @@f[doc.id][val] > 0
+            end
+            @@df[collection.id][val] = df
+          end
+          f = @@f[document.id][entity.value].to_f
+          df = @@df[collection.id][entity.value].to_f
+          tf = options[:tf].call(f).to_f
+          if options[:normalize_word_count]
+            @@wc[document.id] ||= document.word_count
+            tf /= @@wc[document.id]
           end
-          idf = ::Math.log(d.to_f/(i.to_f + 1)).abs
-          tf.to_f/idf.to_f
+          n = @@n[collection.id].to_f
+          idf = options[:idf].call(n, df)
+          tf_idf = tf * idf
+          tf_idf.abs.round(options[:precision])
        end
      end
    end
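
With the defaults, the score works out to f * log(n / (1 + df)), rounded to four digits. A hedged usage sketch of the new pluggable algorithms (assumes `word` sits inside a document inside a collection, since the method raises otherwise):

    word = collection.documents.first.words.first  # hypothetical entity
    Treat::Extractors::Statistics::TfIdf.statistics(
      word,
      :tf => :logarithm,   # tf becomes Math.log(1 + f)
      :idf => :none,       # idf lambda returns 1, disabling the idf term
      :precision => 2
    )
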
@@ -4,11 +4,11 @@ module Treat
       # Experimental algorithm to generate transition matrices.
       class TransitionMatrix
         DefaultOptions = {
-          normalize: true,
-          features: [:tag],
-          condition: lambda { |e| true },
-          entity_types: [:word],
-          relationships: [:parent, :right, :children]
+          :normalize => true,
+          :features => [:tag],
+          :condition => lambda { |e| true },
+          :entity_types => [:word],
+          :relationships => [:parent, :right, :children]
         }
         # Find the transition matrix.
         def self.statistics(entity, options={})
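
The 1.8-compatible hash-rocket keys are drop-in replacements, so callers override them the same way as before. An illustrative call, assuming a tagged `sentence` entity (the condition lambda shown is hypothetical; `has?` appears in the hunk below):

    Treat::Extractors::Statistics::TransitionMatrix.statistics(
      sentence,
      :features => [:tag],
      :condition => lambda { |e| e.has?(:tag) }  # skip untagged entities
    )
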
@@ -34,7 +34,7 @@ module Treat
           next unless options[:condition].call(target)
 
           # Initialize the empty transition matrix.
-
+
 
           # Calculate the transition probabilities.
           options[:features].each do |f1|
@@ -57,16 +57,16 @@ module Treat
             end
           end
 
-          tm[f1][v1][:edge] = empty.call
+          tm[f1][v1][:dependency] = empty.call
 
-          target.edges.each do |id, edge_type|
+          target.dependencies.each do |dependency|
             s = target.ancestor_with_type :sentence
             if s
-              x = s.find(id)
+              x = s.find(dependency.target)
               next unless relative.has?(f2)
               v2 = x.send(f2)
-              tm[f1][v1][:edge][f2][v2] ||= 0.0
-              tm[f1][v1][:edge][f2][v2] += 1.0
+              tm[f1][v1][:dependency][f2][v2] ||= 0.0
+              tm[f1][v1][:dependency][f2][v2] += 1.0
             end
           end
 
@@ -34,14 +34,14 @@ module Treat
             end
           end
 
-          entity.edges.each do |id, edge|
+          entity.dependencies.each do |dependency|
             s = entity.ancestor_with_type :sentence
             if s
-              x = s.find(id)
+              x = s.find(dependency.target)
               next unless h.has?(f2)
               v2 = x.send(f2)
-              if tm[f1][v1][:edge][f2][v2]
-                score += tm[f1][v1][:edge][f2][v2]
+              if tm[f1][v1][:dependency][f2][v2]
+                score += tm[f1][v1][:dependency][f2][v2]
                 count += 1
               end
             end
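
Both transition statistics now walk dependency objects exposing a target id, rather than loose (id, edge_type) pairs. For orientation, a hypothetical slice of the resulting matrix for one tag value might look like:

    # Illustrative shape only; actual keys depend on the requested
    # features and relationships options.
    tm = {
      :tag => {
        'NN' => {
          :parent     => { :tag => { 'VP'  => 3.0 } },
          :dependency => { :tag => { 'VBZ' => 1.0 } }
        }
      }
    }
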
@@ -1,7 +1,7 @@
 module Treat
   module Extractors
     module Time
-      # A wrapper for the 'nickel' gem, which parses 
+      # A wrapper for the 'nickel' gem, which parses
       # times and dates and supplies additional information
       # concerning these. The additional information supplied
       # that this class annotates entities with is:
@@ -11,7 +11,7 @@ module Treat
       # - start_time: a DateTime object representing the beginning of
       # an event.
       # - end_time: a DateTime object representing the end of an event.
-      # 
+      #
       # Examples of values for time_recurrence are:
       #
       # - single: "lunch with megan tomorrow at noon"
@@ -19,33 +19,51 @@ module Treat
       # - weekly: "math class every wed from 8-11am"
       # - daymonthly: "open bar at joes the first friday of every month"
       # - datemonthly: "pay credit card bill on the 22nd of each month"
-      # 
+      #
       # Project website: http://naturalinputs.com/
-      module Nickel
+      class Nickel
         require 'date'
         silence_warnings { require 'nickel' }
         # Extract time information from a bit of text.
         def self.time(entity, options = {})
-          n = silence_warnings { ::Nickel.parse(entity.to_s) }
+          return nil if entity.to_s.strip == ''
+          n = nil
+          silence_warnings { n = ::Nickel.parse(entity.to_s.strip) }
           occ = n.occurrences[0]
+          return nil unless occ
 
           rec = occ.type.to_s.gsub('single', 'once').intern
-          entity.set :time_recurrence, rec
+          time_recurrence = rec
           interval = occ.interval ? occ.interval : :none
-          entity.set :time_recurrence_interval, interval
+          time_recurrence_interval = interval
 
           s = [occ.start_date, occ.start_time]
           ds = [s[0].year, s[0].month, s[0].day] if s[0]
-          #ts = [s[1].hour, s[1].min, s[1].sec] if s[1]
+          ts = [s[1].hour, s[1].minute, s[1].second] if s[1]
 
           e = [occ.end_date, occ.end_time]
           de = [e[0].year, e[0].month, e[0].day] if e[0]
-          #te = [e[1].hour, e[1].min, e[1].sec] if e[1]
+          te = [e[1].hour, e[1].minute, e[1].second] if e[1]
+
+          start_time = ::DateTime.civil(*ds) if ds && !ts
+          start_time = ::DateTime.civil(*ds, *ts) if ds && ts
+          end_time = ::DateTime.civil(*de) if de && !te
+          end_time = ::DateTime.civil(*de, *te) if de && te
 
-          entity.set :start_time, ::DateTime.civil(*ds) if ds
-          entity.set :end_time, ::DateTime.civil(*de) if de
+          time = Treat::Features::Time.new( # Fix - time message.
+            start_time, end_time, time_recurrence,
+            time_recurrence_interval
+          )
 
-          entity.start_time
+          # Keeps the lowest-level time annotations
+          # that do not conflict with the highest-level
+          # time annotation.
+          entity.ancestors_with_type(:phrase).each do |a|
+            unless a.id == entity.id || a.children[0].size == 0
+              a.unset(:time)
+            end
+          end
+          time
         end
       end
     end
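
The extractor now returns a Treat::Features::Time struct instead of setting features on the entity itself. A hedged usage sketch (assumes `phrase` is an already-built Treat entity wrapping "math class every wed from 8-11am"):

    t = Treat::Extractors::Time::Nickel.time(phrase)
    t.start                # => DateTime of the first occurrence
    t.recurrence           # => :weekly
    t.recurrence_interval  # => interval reported by nickel, or :none
    # Returns nil for blank text or when nickel finds no occurrence.
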
@@ -29,26 +29,26 @@ module Treat
         end
         # Default options for the LDA algorithm.
         DefaultOptions = {
-          topics: 20,
-          words_per_topic: 10,
-          iterations: 20
+          :num_topics => 20,
+          :words_per_topic => 10,
+          :iterations => 20
         }
         # Retrieve the topic words of a collection.
         def self.topic_words(collection, options = {})
           options = DefaultOptions.merge(options)
           # Create a corpus with the collection
           sections = collection.sections.collect do |t|
-            t.to_s.encode_compliant('UTF-8') # fix
+            t.to_s.encode('UTF-8', :invalid => :replace,
+            :undef => :replace, :replace => "?") # Fix
           end
           corpus = Lda::TextCorpus.new(sections)
 
           # Create an Lda object for training
           lda = Lda::Lda.new(corpus)
-          lda.num_topics = options[:topics]
+          lda.num_topics = options[:num_topics]
           lda.max_iter = options[:iterations]
           # Run the EM algorithm using random starting points
-          silence_streams(STDOUT, STDERR) { lda.em('random') }
-
+          silence_stdout { lda.em('random') }
           # Load the vocabulary.
           if options[:vocabulary]
             lda.load_vocabulary(options[:vocabulary])
@@ -57,8 +57,8 @@ module Treat
           # Get the topic words and annotate the section.
           topic_words = lda.top_words(options[:words_per_topic])
 
-          topic_words.each do |i, words|
-            collection.each_word do |word|
+          collection.each_word do |word|
+            topic_words.each do |i, words|
               if words.include?(word)
                 word.set :is_topic_word?, true
                 word.set :topic_id, i
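
Inverting the loops traverses the collection's words once and checks each against every topic list, instead of re-walking the collection per topic. A usage sketch with the renamed option (the wrapper class name is assumed from the file path lda.rb):

    topic_words = Treat::Extractors::TopicWords::LDA.topic_words(
      collection,
      :num_topics => 10,       # was :topics in 0.1.4
      :words_per_topic => 5,
      :iterations => 20
    )
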
@@ -22,24 +22,27 @@ module Treat
         def self.topics(text, options = {})
           stems = []
           @@reduce = 0
-          text.to_s.tokenize.words.collect! do |tok|
+          unless text.words.size > 0
+            raise Treat::Exception,
+            "Annotator 'topics' requires processor 'tokenize'."
+          end
+          text.words.collect! do |tok|
             stem = tok.stem.downcase
             val = tok.value.downcase
             stems << stem
             unless stem == val
               stems << val
-              @@reduce += 1
             end
           end
           get_topics
-          topics = score_words(@@industry, stems)
-          topics = topics.merge(score_words(@@region, stems))
-          topics = topics.merge(score_words(@@topics, stems))
+          score_words(@@industry, stems) +
+          score_words(@@region, stems) +
+          score_words(@@topics, stems)
           #Treat::Feature.new(topics)
         end
         # Read the topics from the XML files.
         def self.get_topics
-          return unless @@industry.empty?
+          return unless @@industry.size == 0
           @@industry = read_xml(Treat.lib + '/treat/extractors/topics/reuters/industry.xml')
           @@region = read_xml(Treat.lib + '/treat/extractors/topics/reuters/region.xml')
           @@topics = read_xml(Treat.lib + '/treat/extractors/topics/reuters/topics.xml')
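
Two caller-visible changes here: the text must already be tokenized, and the result is now a concatenated array of category names (score_words returns count_hash.keys below) rather than a merged score hash. A hedged sketch, with the class name assumed from the file path reuters.rb:

    text.tokenize if text.words.size == 0  # 'topics' no longer tokenizes for you
    Treat::Extractors::Topics::Reuters.topics(text)
    # => e.g. ["CANADA", "oil"], category keys with no scores attached
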
@@ -65,21 +68,17 @@ module Treat
             count_hash[cat_name] ||= 0
             word_list.each do |word|
               unless hash[cat_name][word].nil?
-                count_hash[cat_name] =
-                count_hash[cat_name] +
+                count_hash[cat_name] +=
                 hash[cat_name][word]
               end
             end
           end
-          count_hash = best_of_hash(count_hash,
-          (word_list.size.to_f - @@reduce.to_f) / 250.0,
-          100.0 / (1 + word_list.size.to_f - @@reduce.to_f))
-          count_hash
+          count_hash = best_of_hash(count_hash)
+          count_hash.keys
         end
-        def self.best_of_hash(hash, cutoff = 1, scale = 1)
-          cutoff = 1 if cutoff == 0
+        def self.best_of_hash(hash, cutoff = 0.0, scale = 1.0)
           ret = {}
-          hash.keys.each() do |key|
+          hash.keys.each do |key|
             if hash[key] > cutoff
               ret[key] = hash[key] * scale
               ret[key] = ret[key].round(2)
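
With the new defaults (cutoff 0.0, scale 1.0), the filter only drops zero scores and leaves the rest unchanged. Illustrative arithmetic against the code above:

    # Hypothetical scores: 0.0 fails hash[key] > cutoff, so it is dropped.
    best_of_hash({ 'CANADA' => 2.0, 'USA' => 0.0 })
    # => { 'CANADA' => 2.0 }
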
@@ -13437,6 +13437,7 @@
   <word cat="CANADA" name="nuinsco" score="1.000000" />
   <word cat="CANADA" name="noverco" score="1.000000" />
   <word cat="CANADA" name="enscor" score="1.000000" />
+  <word cat="CANADA" name="ottawa" score="1.000000" />
   <word cat="CANADA" name="winnipegg" score="1.000000" />
   <word cat="CANADA" name="mantadoc" score="1.000000" />
   <word cat="CANADA" name="canmar" score="1.000000" />
@@ -0,0 +1,7 @@
+module Treat
+  module Features
+    Time = Struct.new(:start, :end, :recurrence, :recurrence_interval)
+    Roles = Struct.new(:subject, :verb, :object, :patient, :agent)
+    Date = Struct.new(:year, :month, :day)
+  end
+end
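
These structs give extractors a typed return value. A small sketch of building the Time struct by hand, using the recurrence values the Nickel wrapper produces (:once and interval :none):

    require 'date'
    t = Treat::Features::Time.new(
      DateTime.civil(2012, 1, 6, 12, 0, 0),  # start
      nil,                                   # end (open-ended)
      :once,                                 # recurrence
      :none                                  # recurrence_interval
    )
    t.start.to_s  # => "2012-01-06T12:00:00+00:00"
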
@@ -18,7 +18,12 @@ module Treat
         end
         def text(s)
           if s != 'AbiWord' && s != 'application/x-abiword'
-            @plain_text << s if s.strip.length > 0
+            s.strip!
+            if s.length > 0
+              s += ' '
+              s += "\n\n" if s.length < 60
+            end
+            @plain_text << s
           end
         end
       end
@@ -6,10 +6,8 @@ module Treat
       # the appropriate reader based on the file
       # extension of the supplied document.
       class Autoselect
-        # A list of image extensions that should be routed to OCR.
+        # A list of image extensions that should be routed to Ocropus.
         ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
-        # Default options.
-        DefaultOptions = {:ocr => :ocropus}
         # Select the appropriate reader based on the format
         # of the filename in document.
         #
@@ -17,19 +15,20 @@ module Treat
         #
         # - :ocr_engine => :ocropus or :gocr (the OCR engine to use).
         def self.read(document, options)
-          options = DefaultOptions.merge(options)
           ext = document.file.split('.')[-1]
-          reader = ImageExtensions.include?(ext) ? 'ocropus' : ext
+          reader = ImageExtensions.include?(ext) ? 'image' : ext
           reader = 'html' if reader == 'htm'
           reader = 'yaml' if reader == 'yml'
           begin
             r = Treat::Formatters::Readers.const_get(cc(reader))
-          rescue NameError => e
+          rescue NameError
             puts e.message
             raise Treat::Exception,
             "Cannot find a reader for format: '#{ext}'."
           end
           document = r.read(document, options)
+          document.set :encoding, document.to_s.encoding.to_s.downcase
+          document
         end
       end
     end
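
To trace the routing change, the selection logic for a hypothetical scanned image now resolves to the new generic image reader rather than a specific OCR engine:

    ext = 'scan.jpeg'.split('.')[-1]                        # => "jpeg"
    reader = ImageExtensions.include?(ext) ? 'image' : ext  # => "image" (was "ocropus")
    # const_get(cc("image")) then yields Treat::Formatters::Readers::Image,
    # the reader added in this release (see readers/image.rb in the file list).
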