treat 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. data/LICENSE +7 -8
  2. data/TODO +16 -13
  3. data/examples/keywords.rb +89 -1
  4. data/lib/treat/buildable.rb +1 -8
  5. data/lib/treat/categories.rb +3 -4
  6. data/lib/treat/category.rb +1 -1
  7. data/lib/treat/delegatable.rb +1 -1
  8. data/lib/treat/detectors/encoding/native.rb +5 -0
  9. data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
  10. data/lib/treat/detectors/language/language_detector.rb +4 -0
  11. data/lib/treat/detectors/language/what_language.rb +4 -4
  12. data/lib/treat/detectors.rb +1 -1
  13. data/lib/treat/entities/entity.rb +5 -3
  14. data/lib/treat/entities/tokens.rb +14 -5
  15. data/lib/treat/entities/zones.rb +4 -0
  16. data/lib/treat/entities.rb +7 -5
  17. data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
  18. data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
  19. data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
  20. data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
  21. data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
  22. data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
  23. data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
  24. data/lib/treat/extractors/time/chronic.rb +8 -0
  25. data/lib/treat/extractors/time/native.rb +6 -0
  26. data/lib/treat/extractors/time/nickel.rb +31 -23
  27. data/lib/treat/extractors/topic_words/lda.rb +21 -16
  28. data/lib/treat/extractors/topics/reuters.rb +6 -4
  29. data/lib/treat/extractors.rb +7 -7
  30. data/lib/treat/formatters/readers/abw.rb +32 -0
  31. data/lib/treat/formatters/readers/autoselect.rb +13 -11
  32. data/lib/treat/formatters/readers/doc.rb +13 -0
  33. data/lib/treat/formatters/readers/gocr.rb +2 -0
  34. data/lib/treat/formatters/readers/html.rb +21 -1
  35. data/lib/treat/formatters/readers/ocropus.rb +3 -3
  36. data/lib/treat/formatters/readers/odt.rb +41 -0
  37. data/lib/treat/formatters/readers/pdf.rb +5 -2
  38. data/lib/treat/formatters/readers/txt.rb +2 -0
  39. data/lib/treat/formatters/serializers/xml.rb +3 -2
  40. data/lib/treat/formatters/serializers/yaml.rb +2 -0
  41. data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
  42. data/lib/treat/formatters/unserializers/xml.rb +6 -1
  43. data/lib/treat/formatters/unserializers/yaml.rb +5 -1
  44. data/lib/treat/formatters/visualizers/dot.rb +35 -37
  45. data/lib/treat/formatters/visualizers/html.rb +1 -0
  46. data/lib/treat/formatters/visualizers/inspect.rb +4 -0
  47. data/lib/treat/formatters/visualizers/short_value.rb +18 -3
  48. data/lib/treat/formatters/visualizers/standoff.rb +11 -6
  49. data/lib/treat/formatters/visualizers/tree.rb +5 -1
  50. data/lib/treat/formatters/visualizers/txt.rb +6 -1
  51. data/lib/treat/formatters.rb +1 -1
  52. data/lib/treat/group.rb +4 -3
  53. data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
  54. data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
  55. data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
  56. data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
  57. data/lib/treat/inflectors/stem/porter.rb +6 -2
  58. data/lib/treat/inflectors/stem/porter_c.rb +4 -1
  59. data/lib/treat/inflectors/stem/uea.rb +4 -4
  60. data/lib/treat/languages/english/tags.rb +16 -0
  61. data/lib/treat/languages/english.rb +4 -1
  62. data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
  63. data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
  64. data/lib/treat/lexicalizers/tag/brill.rb +3 -11
  65. data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
  66. data/lib/treat/lexicalizers.rb +0 -2
  67. data/lib/treat/processors/chunkers/txt.rb +4 -4
  68. data/lib/treat/processors/parsers/enju.rb +3 -17
  69. data/lib/treat/processors/parsers/stanford.rb +4 -0
  70. data/lib/treat/processors/segmenters/punkt.rb +1 -0
  71. data/lib/treat/processors/segmenters/stanford.rb +4 -0
  72. data/lib/treat/processors/segmenters/tactful.rb +4 -1
  73. data/lib/treat/processors/tokenizers/punkt.rb +1 -2
  74. data/lib/treat/processors/tokenizers/stanford.rb +4 -0
  75. data/lib/treat/processors/tokenizers/tactful.rb +1 -1
  76. data/lib/treat/processors.rb +4 -4
  77. data/lib/treat/proxies.rb +18 -11
  78. data/lib/treat/registrable.rb +12 -5
  79. data/lib/treat/sugar.rb +8 -3
  80. data/lib/treat/tree.rb +10 -3
  81. data/lib/treat.rb +55 -55
  82. data/test/tc_entity.rb +7 -7
  83. data/test/tc_extractors.rb +6 -4
  84. data/test/tc_formatters.rb +0 -4
  85. data/test/tests.rb +2 -0
  86. data/test/texts.rb +4 -4
  87. metadata +48 -56
  88. data/examples/texts/bugged_out.txt +0 -26
  89. data/examples/texts/half_cocked_basel.txt +0 -16
  90. data/examples/texts/hedge_funds.txt +0 -24
  91. data/examples/texts/hose_and_dry.txt +0 -19
  92. data/examples/texts/hungarys_troubles.txt +0 -46
  93. data/examples/texts/indias_slowdown.txt +0 -15
  94. data/examples/texts/merkozy_rides_again.txt +0 -24
  95. data/examples/texts/prada_is_not_walmart.txt +0 -9
  96. data/examples/texts/republican_nomination.txt +0 -26
  97. data/examples/texts/to_infinity_and_beyond.txt +0 -15
  98. data/lib/treat/entities/text.rb +0 -7
  99. data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
  100. data/lib/treat/formatters/cleaners/html.rb +0 -17
@@ -2,62 +2,54 @@ module Treat
2
2
  module Formatters
3
3
  module Visualizers
4
4
  class Dot
5
- # Border colors to use for different POS tags.
6
- BorderColors = {
7
- :verb => "#00AABB",
8
- :noun => "#FAD4A7",
9
- :adverb => '#103585',
10
- :adjective => '#D21D54'
11
- }
5
+ DefaultOptions = {colors: {}, :features => :all}
12
6
  # Create the top-most graph structure
13
7
  # and delegate the creation of the graph
14
8
  # nodes to to_dot.
15
9
  def self.visualize(entity, options = {})
10
+ options = DefaultOptions.merge(options)
16
11
  string = "graph {"
17
- string << self.to_dot(entity)
12
+ string << self.to_dot(entity, options)
18
13
  string << "\n}"
19
14
  end
20
15
  # dot -Tpdf test4.dot > test4.pdf
21
- def self.to_dot(entity)
16
+ def self.to_dot(entity, options)
17
+ # Id
22
18
  string = ''
23
- if entity.is_leaf?
24
- if entity.is_a?(Treat::Entities::Word)
25
- label = "label=\"#{entity.value} (#{entity.tag})\","
26
- label << "color=\"#{BorderColors[entity.cat]}\","
27
- else
28
- label = "label=\"#{entity.value.inspect[1..-2]}\","
29
- end
19
+ label = ''
20
+ string = "\n#{entity.id} ["
21
+ # Value
22
+ if entity.is_a?(Treat::Entities::Token)
23
+ label = entity.to_s
30
24
  else
31
- if entity.class < Entities::Constituent
32
- label = "label=\"#{entity.tag}\","
33
- # label << "color=\"#{BorderColors[entity.tag]}\","
34
- else
35
- label = "label=\"#{cc(cl(entity.class))}\","
25
+ label = entity.type.to_s.capitalize + " "
26
+ if entity.is_leaf?
27
+ label = entity.short_value.gsub(' [...]', " [...] \\n")
36
28
  end
37
29
  end
38
- string << "\n#{entity.id} ["
30
+ # Features
39
31
  if entity.has_features?
40
- string << label
41
- entity.features.each_pair do |feature, value|
42
- if value.is_a?(Treat::Entities::Entity)
43
- string << "#{feature}=\"#{value.id}\","
44
- else
45
- string << "#{feature}=\"#{value}\","
32
+ unless options[:features] == :none
33
+ label << "\\n"
34
+ entity.features.each do |feature, value|
35
+ if options[:features] == :all ||
36
+ options[:features].include?(feature)
37
+ if value.is_a?(Treat::Entities::Entity)
38
+ label << "\\n#{feature}=\\\"*#{value.id}\\\","
39
+ else
40
+ label << "\\n#{feature}=\\\"#{value}\\\","
41
+ end
42
+ end
46
43
  end
47
44
  end
48
- string = string[0..-2]
49
- string << "]"
50
- else
51
- string << "#{label[0..-2]}]"
52
45
  end
46
+ label = label[0..-2] if label[-1] == ','
47
+ string << "label=\"#{label}\"]"
48
+ # Parent-child relationships.
53
49
  if entity.has_parent?
54
50
  string << "\n#{entity.parent.id} -- #{entity.id};"
55
51
  end
56
- if entity.has_children?
57
- entity.each do |child|
58
- string << self.to_dot(child)
59
- end
60
- end
52
+ # Edges.
61
53
  if entity.has_edges?
62
54
  entity.edges.each_pair do |target, type|
63
55
  string << "\n#{entity.id} -- #{target}"
@@ -65,6 +57,12 @@ module Treat
65
57
  string << "arrowhead=\"odiamond\"]"
66
58
  end
67
59
  end
60
+ # Recurse.
61
+ if entity.has_children?
62
+ entity.each do |child|
63
+ string << self.to_dot(child, options)
64
+ end
65
+ end
68
66
  string
69
67
  end
70
68
  end
@@ -5,6 +5,7 @@ module Treat
5
5
  class HTML
6
6
  # Not implemented yet.
7
7
  def self.visualize(entity, options = {})
8
+ raise 'Not implemented yet.'
8
9
  end
9
10
  end
10
11
  end
@@ -1,7 +1,11 @@
1
1
  module Treat
2
2
  module Formatters
3
3
  module Visualizers
4
+ # Handles the call to inspect.
4
5
  class Inspect
6
+ # Return a terminal-friendly visualization of an entity.
7
+ #
8
+ # Options: none.
5
9
  def self.visualize(entity, options = {})
6
10
  s = "#{entity.class.to_s.split('::')[-1]} (#{entity.id.to_s})"
7
11
  unless caller_method == :inspect
@@ -2,11 +2,26 @@ module Treat
2
2
  module Formatters
3
3
  module Visualizers
4
4
  class ShortValue
5
+ # Default options for the visualizer.
6
+ DefaultOptions = { max_words: 6, max_length: 30 }
7
+ # Returns the text value of an entity, shortened
8
+ # with [...] if the value is longer than :max_words
9
+ # or longer than :max_length.
10
+ #
11
+ # Options:
12
+ # - (Integer) :max_words => the maximum number
13
+ # of words in an entity before it is shortened.
14
+ # - (Integer) :max_length => the maximum number
15
+ # of characters in an entity before it is shortened.
5
16
  def self.visualize(entity, options = {})
6
- options[:max_length] ||= 6
17
+ options = DefaultOptions.merge(options)
7
18
  words = entity.to_s.split(' ')
8
- return entity.to_s if words.size < options[:max_length]
9
- words[0..2].join(' ') + ' [...] ' + words[-3..-1].join(' ')
19
+ if words.size < options[:max_words] ||
20
+ entity.to_s.length < options[:max_length]
21
+ entity.to_s
22
+ else
23
+ words[0..2].join(' ') + ' [...] ' + words[-3..-1].join(' ')
24
+ end
10
25
  end
11
26
  end
12
27
  end
@@ -5,24 +5,29 @@ module Treat
5
5
  # an entity in standoff format; for example:
6
6
  # (S (NP John) (VP has (VP come))).
7
7
  class Standoff
8
- Recurse = Proc.new do |entity, options|
8
+ # Default options for the visualizer.
9
+ DefaultOptions = { indent: 0 }
10
+ # A lambda to recursively visualize the children
11
+ # of an entity.
12
+ Recurse = lambda do |entity, options|
9
13
  v = ''
10
14
  entity.each { |child| v += visualize(child, options) }
11
15
  v
12
16
  end
13
17
  # Visualize the entity using standoff notation.
14
- # This can only be called on sentences, as it
15
- # is not a suitable format to represent larger
16
- # entity.
18
+ # This can only be called on sentences and smaller
19
+ # entities, as it is not a suitable format to
20
+ # represent larger entities.
17
21
  def self.visualize(entity, options = {})
18
- options = {:indent => 0} if options.empty?
22
+ options = DefaultOptions.merge(options)
19
23
  value = ''; spaces = ''
20
24
  options[:indent].times { spaces << ' '}
21
25
  options[:indent] += 1
22
26
  if entity.is_a?(Treat::Entities::Token)
23
27
  value += "#{spaces}(#{entity.tag} #{entity.value})"
24
28
  elsif entity.is_a?(Treat::Entities::Constituent)
25
- value += ("#{spaces}(#{entity.tag}\n" +
29
+ tag = entity.has?(:tag) ? entity.tag : ''
30
+ value += ("#{spaces}(#{tag}\n" +
26
31
  "#{Recurse.call(entity, options)})\n")
27
32
  elsif entity.is_a?(Treat::Entities::Sentence)
28
33
  value += ("#{spaces}(S\n" +
@@ -1,11 +1,15 @@
1
1
  module Treat
2
2
  module Formatters
3
3
  module Visualizers
4
+ # This class generates an ASCII representation
5
+ # of a tree of entities.
4
6
  class Tree
7
+ # Default options for the visualizer.
8
+ DefaultOptions = { indent: 0 }
5
9
  # Obtain a plain text tree representation
6
10
  # of the entity.
7
11
  def self.visualize(entity, options = {})
8
- options = {:indent => 0} if options.empty?
12
+ options = DefaultOptions.merge(options)
9
13
  string = ''
10
14
  if entity.has_children?
11
15
  spacer = '--'
@@ -3,10 +3,15 @@ module Treat
3
3
  module Visualizers
4
4
  # Creates a plain text visualization of an entity.
5
5
  class Txt
6
+ # The default options for the visualizer.
7
+ DefaultOptions = { sep: ' ' }
6
8
  # Obtain a plain text visualization of the entity,
7
9
  # with no additional information.
10
+ #
11
+ # Options:
12
+ # (String) :sep => the separator to use between words.
8
13
  def self.visualize(entity, options = {})
9
- options[:sep] = ' '
14
+ options = DefaultOptions.merge(options)
10
15
  return entity.value if !entity.has_children?
11
16
  value = ''
12
17
  entity.each do |child|
@@ -34,7 +34,7 @@ module Treat
34
34
  # Cleaners strip a text from its mark up.
35
35
  module Cleaners
36
36
  extend Group
37
- self.type = :annotator
37
+ self.type = :transformer
38
38
  self.targets = [:document]
39
39
  self.default = :html
40
40
  end
data/lib/treat/group.rb CHANGED
@@ -61,14 +61,15 @@ module Treat
61
61
  end
62
62
  is_target
63
63
  end
64
+ # Cache the list of adaptors to improve performance.
65
+ @@list = {}
64
66
  # Populates once the list of the adaptors in the group
65
67
  # by crawling the filesystem.
66
- @@list = {}
67
68
  def list
68
69
  mod = ucc(cl(self))
69
70
  if @@list[mod].nil?
70
71
  @@list[mod] = []
71
- dirs = Dir["#{File.dirname(__FILE__)}/*/#{mod}/*.rb"] # Fix
72
+ dirs = Dir.glob("#{Treat.lib}/treat/*/#{mod}/*.rb")
72
73
  dirs.each do |file|
73
74
  @@list[mod] <<
74
75
  :"#{file.split('/')[-1][0..-4]}"
@@ -79,7 +80,7 @@ module Treat
79
80
  # Get constants in this module, excluding those
80
81
  # defined by parent modules.
81
82
  def const_get(const); super(const, false); end
82
- # Autoload the algorithms.
83
+ # Lazy load the classes in the group.
83
84
  def const_missing(const)
84
85
  bits = self.ancestors[0].to_s.split('::')
85
86
  bits.collect! { |bit| ucc(bit) }
@@ -1,43 +1,40 @@
1
1
  module Treat
2
2
  module Inflectors
3
3
  module CardinalWords
4
+ # This class is a wrapper for the functions included
5
+ # in the 'linguistics' gem that allow describing a
6
+ # number in words in cardinal form.
7
+ #
8
+ # Project website: http://deveiate.org/projects/Linguistics/
4
9
  class Linguistics
10
+ # Require the 'linguistics' gem.
5
11
  silence_warnings { require 'linguistics' }
12
+ # Return the description of a cardinal number in words.
6
13
  #
7
14
  # Options:
8
15
  #
9
- # :group => Controls how many numbers at a time are
16
+ # - :group => Controls how many numbers at a time are
10
17
  # grouped together. Valid values are 0 (normal grouping),
11
18
  # 1 (single-digit grouping, e.g., “one, two, three, four”),
12
19
  # 2 (double-digit grouping, e.g., “twelve, thirty-four”, or
13
20
  # 3 (triple-digit grouping, e.g., “one twenty-three, four”).
14
- # :comma => Set the character/s used to separate word groups.
21
+ # - :comma => Set the character/s used to separate word groups.
15
22
  # Defaults to ", ".
16
- # :and => Set the word and/or characters used where ' and '
23
+ # - :and => Set the word and/or characters used where ' and '
17
24
  # (the default) is normally used. Setting :and to ' ', for
18
25
  # example, will cause 2556 to be returned as “two-thousand,
19
26
  # five hundred fifty-six” instead of “two-thousand, five
20
27
  # hundred and fifty-six”.
21
- # :zero => Set the word used to represent the numeral 0 in
28
+ # - :zero => Set the word used to represent the numeral 0 in
22
29
  # the result. 'zero' is the default.
23
- # :decimal => Set the translation of any decimal points in
30
+ # - :decimal => Set the translation of any decimal points in
24
31
  # the number; the default is 'point'.
25
- # :asArray If set to a true value, the number will be returned
32
+ # - :asArray If set to a true value, the number will be returned
26
33
  # as an array of word groups instead of a String.
27
34
  #
28
35
  # More specific options when using :type => :ordinal:
29
- #
30
- #
31
36
  def self.cardinal_words(entity, options = {})
32
- begin
33
- l = entity.language.to_s.upcase
34
- delegate = nil
35
- silence_warnings { delegate = ::Linguistics.const_get(l) }
36
- rescue RuntimeError
37
- raise "Ruby Linguistics does not have a module " +
38
- " installed for the #{entity.language} language."
39
- end
40
- silence_warnings { delegate.numwords(entity.to_s, options) }
37
+ silence_warnings { ::Linguistics::EN.numwords(entity.to_s, options) }
41
38
  end
42
39
  end
43
40
  end
@@ -1,15 +1,28 @@
1
1
  module Treat
2
2
  module Inflectors
3
3
  module Conjugations
4
+ # This class is a wrapper for the functions included
5
+ # in the 'linguistics' gem that allow conjugating verbs.
6
+ #
7
+ # Project website: http://deveiate.org/projects/Linguistics/
4
8
  class Linguistics
5
9
  silence_warnings { require 'linguistics' }
6
- def self.conjugate(entity, parameters)
10
+ # Conjugate a verb using ruby linguistics with the specified
11
+ # mode, tense, count and person.
12
+ #
13
+ # Options:
14
+ #
15
+ # - (Symbol) :mode => :infinitive, :indicative, :subjunctive, :participle
16
+ # - (Symbol) :tense => :past, :present, :future
17
+ # - (Symbol) :count => :singular, :plural
18
+ # - (Symbol) :person => :first, :second, :third
19
+ def self.conjugations(entity, parameters)
7
20
  begin
8
21
  l = entity.language.to_s.upcase
9
22
  delegate = nil
10
23
  silence_warnings { delegate = ::Linguistics.const_get(l) }
11
24
  rescue RuntimeError
12
- raise "Ruby Linguistics does not have a module " +
25
+ raise "Ruby Linguistics does not have a module " +
13
26
  " installed for the #{entity.language} language."
14
27
  end
15
28
  if parameters[:mode] == :infinitive
@@ -27,4 +40,4 @@ module Treat
27
40
  end
28
41
  end
29
42
  end
30
- end
43
+ end
@@ -1,24 +1,35 @@
1
1
  module Treat
2
2
  module Inflectors
3
3
  module Declensions
4
- silence_warnings { require 'linguistics' }
5
- # Obtain word declensions in English using the
6
- # ruby 'linguistics' gem.
4
+ # This class is a wrapper for the functions included
5
+ # in the 'linguistics' gem that allow obtaining the
6
+ # declensions of a word.
7
+ #
8
+ # Project website: http://deveiate.org/projects/Linguistics/
7
9
  class Linguistics
8
- def self.declense(entity, options = {})
10
+ # Require Ruby Linguistics
11
+ silence_warnings { require 'linguistics' }
12
+ # Retrieve a declension of a word using the 'linguistics' gem.
13
+ #
14
+ # Options:
15
+ #
16
+ # - (Identifier) :count => :singular, :plural
17
+ def self.declensions(entity, options = {})
9
18
  begin
10
19
  l = entity.language.to_s.upcase
11
20
  delegate = nil
12
21
  silence_warnings { delegate = ::Linguistics.const_get(l) }
13
22
  rescue RuntimeError
14
- raise "Ruby Linguistics does not have a module " +
23
+ raise "Ruby Linguistics does not have a module " +
15
24
  " installed for the #{entity.language} language."
16
25
  end
17
26
  string = entity.to_s
18
27
  if options[:count] == :plural
19
28
  if entity.has?(:category) &&
20
29
  [:noun, :adjective, :verb].include?(entity.category)
21
- silence_warnings { delegate.send(:"plural_#{entity.category}", string) }
30
+ silence_warnings do
31
+ delegate.send(:"plural_#{entity.category}", string)
32
+ end
22
33
  else
23
34
  silence_warnings { delegate.plural(string) }
24
35
  end
@@ -1,19 +1,18 @@
1
1
  module Treat
2
2
  module Inflectors
3
3
  module OrdinalWords
4
+ # This class is a wrapper for the functions included
5
+ # in the 'linguistics' gem that allow describing a
6
+ # number in words in ordinal form.
7
+ #
8
+ # Project website: http://deveiate.org/projects/Linguistics/
4
9
  class Linguistics
10
+ # Require Ruby Linguistics.
5
11
  silence_warnings { require 'linguistics' }
12
+ # Describe a number in words in ordinal form, using the
13
+ # 'linguistics' gem.
6
14
  def self.ordinal_words(number, options = {})
7
- begin
8
- l = number.language.to_s.upcase
9
- delegate = nil
10
- silence_warnings { delegate = ::Linguistics.const_get(l) }
11
- rescue RuntimeError
12
- lang = Treat::Languages.describe(number.language)
13
- raise "Ruby Linguistics does not have a module " +
14
- " installed for the #{lang} language."
15
- end
16
- silence_warnings { delegate.ordinate(number.to_s) }
15
+ silence_warnings { ::Linguistics::EN.ordinate(number.to_s) }
17
16
  end
18
17
  end
19
18
  end
@@ -2,16 +2,20 @@ module Treat
2
2
  module Inflectors
3
3
  module Stem
4
4
  # Stem a word using a native Ruby implementation of the
5
- # Porter stemming algorithm, ported to Ruby from the
6
- # version coded up in Perl.
5
+ # Porter stemming algorithm, ported to Ruby from a
6
+ # version coded up in Perl. This is a simplified
7
+ # implementation; for a true and fast Porter stemmer,
8
+ # see Treat::Inflectors::Stem::PorterC.
7
9
  #
8
10
  # Authored by Ray Pereda (raypereda@hotmail.com).
11
+ # Unknown license.
9
12
  #
10
13
  # Original paper: Porter, 1980. An algorithm for suffix stripping,
11
14
  # Program, Vol. 14, no. 3, pp 130-137,
12
15
  # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
13
16
  class Porter
14
17
  # Returns the stem of a word using a native Porter stemmer.
18
+ #
15
19
  # Options: none.
16
20
  def self.stem(word, options = {})
17
21
  # Copy the word and convert it to a string.
@@ -9,10 +9,13 @@ module Treat
9
9
  # Program, Vol. 14, no. 3, pp 130-137,
10
10
  # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
11
11
  class PorterC
12
+ # Require the 'ruby-stemmer' gem.
12
13
  silence_warnings { require 'lingua/stemmer' }
14
+ # Remove a conflict between this gem and the 'engtagger' gem.
13
15
  ::LinguaStemmer = ::Lingua
14
16
  Object.instance_eval { remove_const :Lingua }
15
- # Stem the word using the Porter C algorithm.
17
+ # Stem the word using a full-blown Porter stemmer in C.
18
+ #
16
19
  # Options: none.
17
20
  def self.stem(word, options = {})
18
21
  silence_warnings { ::LinguaStemmer.stemmer(word.to_s) }
@@ -9,10 +9,10 @@ module Treat
9
9
  # groups of rules: the first to clean the tokens, and
10
10
  # the second to alter suffixes."
11
11
  #
12
- # Project website: https://github.com/ealdent/uea-stemmer
13
- # Original paper: Jenkins, Marie-Claire, Smith, Dan,
14
- # Conservative stemming for search and indexing, 2005.
15
- # http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
12
+ # Project website: https://github.com/ealdent/uea-stemmer
13
+ # Original paper: Jenkins, Marie-Claire, Smith, Dan,
14
+ # Conservative stemming for search and indexing, 2005.
15
+ # http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
16
16
  class UEA
17
17
  # Require the 'uea-stemmer' gem.
18
18
  silence_warnings { require 'uea-stemmer' }
@@ -183,6 +183,22 @@ module Treat
183
183
  ['PRT', 'Particle'],
184
184
  ['S', 'Sentence']
185
185
  ]
186
+
187
+ # Maps Enju categories to Treat categories.
188
+ EnjuCatToCategory = {
189
+ 'ADJ' => :adjective,
190
+ 'ADV' => :adverb,
191
+ 'CONJ' => :conjunction,
192
+ 'COOD' => :conjunction,
193
+ 'C' => :complementizer,
194
+ 'D' => :determiner,
195
+ 'N' => :noun,
196
+ 'P' => :preposition,
197
+ 'PN' => :punctuation,
198
+ 'SC' => :conjunction,
199
+ 'V' => :verb,
200
+ 'PRT' => :particle
201
+ }
186
202
 
187
203
  # Description of the xcat in the Enju output specification.
188
204
  EnjuXCatDescription = [
@@ -1,8 +1,10 @@
1
1
  module Treat
2
2
  module Languages
3
3
  class English
4
+
4
5
  require 'treat/languages/english/tags'
5
6
  require 'treat/languages/english/categories'
7
+
6
8
  Extractors = {
7
9
  time: [:chronic],
8
10
  topics: [:reuters],
@@ -11,7 +13,7 @@ module Treat
11
13
  }
12
14
  Processors = {
13
15
  chunkers: [:txt],
14
- parsers: [:enju, :stanford],
16
+ parsers: [:stanford, :enju],
15
17
  segmenters: [:tactful, :punkt, :stanford],
16
18
  tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
17
19
  }
@@ -28,6 +30,7 @@ module Treat
28
30
  ordinal_words: [:linguistics],
29
31
  cardinal_words: [:linguistics]
30
32
  }
33
+
31
34
  end
32
35
  end
33
36
  end
@@ -4,13 +4,12 @@ module Treat
4
4
  # A class that detects the category of a word from its tag,
5
5
  # using the default tagger for the language of the entity.
6
6
  class FromTag
7
- DefaultOptions = { tagger: nil }
8
7
  # Find the category of the current entity.
8
+ #
9
9
  # Options:
10
- # :tagger => (Symbol) force the use of a tagger.
11
- # :tag_to_cat => (Hash) a list of categories for each possible tag.
10
+ #
11
+ # - (Symbol) :tagger => force the use of a tagger.
12
12
  def self.category(entity, options = {})
13
- options = DefaultOptions.merge(options)
14
13
  tag = options[:tagger].nil? ? entity.tag : entity.tag(options[:tagger])
15
14
  lang = Treat::Languages.get(entity.language)
16
15
  cat = lang::WordTagToCategory[tag]
@@ -21,6 +20,7 @@ module Treat
21
20
  if cat.size == 1
22
21
  return cat[0]
23
22
  else
23
+ entity.set :tag_set, :penn
24
24
  if entity.has?(:tag_set)
25
25
  if cat[entity.tag_set]
26
26
  return cat[entity.tag_set]
@@ -27,7 +27,7 @@ module Treat
27
27
  end
28
28
  # Return the subject of the sentence|verb.
29
29
  def self.subject(entity, options)
30
- verb = entity.category == :verb ?
30
+ verb = (entity.has?(:category) && entity.category == :verb) ?
31
31
  main_verb(entity) : entity.main_verb
32
32
  args = []
33
33
  main_verb.edges.each_pair do |id,edge|
@@ -37,7 +37,7 @@ module Treat
37
37
  end
38
38
  # Return the object of the sentence|verb.
39
39
  def self.object(entity, options)
40
- verb = entity.category == :verb ?
40
+ verb = (entity.has?(:category) && entity.category == :verb) ?
41
41
  main_verb(entity) : entity.main_verb
42
42
  if verb.voice == 'passive'
43
43
  return
@@ -50,7 +50,7 @@ module Treat
50
50
  end
51
51
  # Find the main verb (shallowest verb in the tree).
52
52
  def self.main_verb(entity, options)
53
- verbs = entity.words_with_cat(:verb)
53
+ verbs = entity.verbs
54
54
  if verbs.empty?
55
55
  return
56
56
  end
@@ -52,24 +52,16 @@ module Treat
52
52
  @@tagger = nil
53
53
  # Hold the user-set options
54
54
  @@options = {}
55
- # Hold the default options.
56
- DefaultOptions = {
57
- lexicon: nil,
58
- lexical_rules: nil,
59
- contextual_rules: nil
60
- }
61
55
  # Tag words using a native Brill tagger.
62
56
  #
63
- # Available options:
57
+ # Options:
58
+ #
64
59
  # :lexicon => String (Lexicon file to use)
65
60
  # :lexical_rules => String (Lexical rule file to use)
66
61
  # :contextual_rules => String (Contextual rules file to use)
67
62
  def self.tag(entity, options = {})
68
63
  # Reinitialize the tagger if the options have changed.
69
- if options != @@options
70
- @@options = DefaultOptions.merge(options)
71
- @@tagger = nil # Reset the tagger
72
- end
64
+ @@tagger = nil if options != @@options
73
65
  # Create the tagger if necessary
74
66
  @@tagger ||= ::Brill::Tagger.new(options[:lexicon],
75
67
  options[:lexical_rules], options[:contextual_rules])
@@ -24,9 +24,8 @@ module Treat
24
24
  @@options = {}
25
25
  # Hold the default options.
26
26
  DefaultOptions = {
27
- unknown_word_tag: '?',
28
- relax: false,
29
- debug: false
27
+ unknown_word_tag: 'FW',
28
+ relax: false
30
29
  }
31
30
  # Tag the word using a probabilistic model taking
32
31
  # into account known words found in a lexicon and
@@ -34,11 +33,10 @@ module Treat
34
33
  #
35
34
  # Options:
36
35
  #
37
- # :relax => (Boolean) Relax the Hidden Markov Model:
36
+ # - (Boolean) :relax => Relax the Hidden Markov Model:
38
37
  # this may improve accuracy for uncommon words,
39
38
  # particularly words used polysemously.
40
- # :debug => (Boolean) Print debug messages.
41
- # :unknown_word_tag => (String) Tag for unknown words.
39
+ # - (String) :unknown_word_tag => Tag for unknown words.
42
40
  def self.tag(entity, options = {})
43
41
  # Reinitialize the tagger if the options have changed.
44
42
  if options != @@options