treat 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/LICENSE +7 -8
  2. data/TODO +16 -13
  3. data/examples/keywords.rb +89 -1
  4. data/lib/treat/buildable.rb +1 -8
  5. data/lib/treat/categories.rb +3 -4
  6. data/lib/treat/category.rb +1 -1
  7. data/lib/treat/delegatable.rb +1 -1
  8. data/lib/treat/detectors/encoding/native.rb +5 -0
  9. data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
  10. data/lib/treat/detectors/language/language_detector.rb +4 -0
  11. data/lib/treat/detectors/language/what_language.rb +4 -4
  12. data/lib/treat/detectors.rb +1 -1
  13. data/lib/treat/entities/entity.rb +5 -3
  14. data/lib/treat/entities/tokens.rb +14 -5
  15. data/lib/treat/entities/zones.rb +4 -0
  16. data/lib/treat/entities.rb +7 -5
  17. data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
  18. data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
  19. data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
  20. data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
  21. data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
  22. data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
  23. data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
  24. data/lib/treat/extractors/time/chronic.rb +8 -0
  25. data/lib/treat/extractors/time/native.rb +6 -0
  26. data/lib/treat/extractors/time/nickel.rb +31 -23
  27. data/lib/treat/extractors/topic_words/lda.rb +21 -16
  28. data/lib/treat/extractors/topics/reuters.rb +6 -4
  29. data/lib/treat/extractors.rb +7 -7
  30. data/lib/treat/formatters/readers/abw.rb +32 -0
  31. data/lib/treat/formatters/readers/autoselect.rb +13 -11
  32. data/lib/treat/formatters/readers/doc.rb +13 -0
  33. data/lib/treat/formatters/readers/gocr.rb +2 -0
  34. data/lib/treat/formatters/readers/html.rb +21 -1
  35. data/lib/treat/formatters/readers/ocropus.rb +3 -3
  36. data/lib/treat/formatters/readers/odt.rb +41 -0
  37. data/lib/treat/formatters/readers/pdf.rb +5 -2
  38. data/lib/treat/formatters/readers/txt.rb +2 -0
  39. data/lib/treat/formatters/serializers/xml.rb +3 -2
  40. data/lib/treat/formatters/serializers/yaml.rb +2 -0
  41. data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
  42. data/lib/treat/formatters/unserializers/xml.rb +6 -1
  43. data/lib/treat/formatters/unserializers/yaml.rb +5 -1
  44. data/lib/treat/formatters/visualizers/dot.rb +35 -37
  45. data/lib/treat/formatters/visualizers/html.rb +1 -0
  46. data/lib/treat/formatters/visualizers/inspect.rb +4 -0
  47. data/lib/treat/formatters/visualizers/short_value.rb +18 -3
  48. data/lib/treat/formatters/visualizers/standoff.rb +11 -6
  49. data/lib/treat/formatters/visualizers/tree.rb +5 -1
  50. data/lib/treat/formatters/visualizers/txt.rb +6 -1
  51. data/lib/treat/formatters.rb +1 -1
  52. data/lib/treat/group.rb +4 -3
  53. data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
  54. data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
  55. data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
  56. data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
  57. data/lib/treat/inflectors/stem/porter.rb +6 -2
  58. data/lib/treat/inflectors/stem/porter_c.rb +4 -1
  59. data/lib/treat/inflectors/stem/uea.rb +4 -4
  60. data/lib/treat/languages/english/tags.rb +16 -0
  61. data/lib/treat/languages/english.rb +4 -1
  62. data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
  63. data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
  64. data/lib/treat/lexicalizers/tag/brill.rb +3 -11
  65. data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
  66. data/lib/treat/lexicalizers.rb +0 -2
  67. data/lib/treat/processors/chunkers/txt.rb +4 -4
  68. data/lib/treat/processors/parsers/enju.rb +3 -17
  69. data/lib/treat/processors/parsers/stanford.rb +4 -0
  70. data/lib/treat/processors/segmenters/punkt.rb +1 -0
  71. data/lib/treat/processors/segmenters/stanford.rb +4 -0
  72. data/lib/treat/processors/segmenters/tactful.rb +4 -1
  73. data/lib/treat/processors/tokenizers/punkt.rb +1 -2
  74. data/lib/treat/processors/tokenizers/stanford.rb +4 -0
  75. data/lib/treat/processors/tokenizers/tactful.rb +1 -1
  76. data/lib/treat/processors.rb +4 -4
  77. data/lib/treat/proxies.rb +18 -11
  78. data/lib/treat/registrable.rb +12 -5
  79. data/lib/treat/sugar.rb +8 -3
  80. data/lib/treat/tree.rb +10 -3
  81. data/lib/treat.rb +55 -55
  82. data/test/tc_entity.rb +7 -7
  83. data/test/tc_extractors.rb +6 -4
  84. data/test/tc_formatters.rb +0 -4
  85. data/test/tests.rb +2 -0
  86. data/test/texts.rb +4 -4
  87. metadata +48 -56
  88. data/examples/texts/bugged_out.txt +0 -26
  89. data/examples/texts/half_cocked_basel.txt +0 -16
  90. data/examples/texts/hedge_funds.txt +0 -24
  91. data/examples/texts/hose_and_dry.txt +0 -19
  92. data/examples/texts/hungarys_troubles.txt +0 -46
  93. data/examples/texts/indias_slowdown.txt +0 -15
  94. data/examples/texts/merkozy_rides_again.txt +0 -24
  95. data/examples/texts/prada_is_not_walmart.txt +0 -9
  96. data/examples/texts/republican_nomination.txt +0 -26
  97. data/examples/texts/to_infinity_and_beyond.txt +0 -15
  98. data/lib/treat/entities/text.rb +0 -7
  99. data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
  100. data/lib/treat/formatters/cleaners/html.rb +0 -17
@@ -2,62 +2,54 @@ module Treat
2
2
  module Formatters
3
3
  module Visualizers
4
4
  class Dot
5
- # Border colors to use for different POS tags.
6
- BorderColors = {
7
- :verb => "#00AABB",
8
- :noun => "#FAD4A7",
9
- :adverb => '#103585',
10
- :adjective => '#D21D54'
11
- }
5
+ DefaultOptions = {colors: {}, :features => :all}
12
6
  # Create the top-most graph structure
13
7
  # and delegate the creation of the graph
14
8
  # nodes to to_dot.
15
9
  def self.visualize(entity, options = {})
10
+ options = DefaultOptions.merge(options)
16
11
  string = "graph {"
17
- string << self.to_dot(entity)
12
+ string << self.to_dot(entity, options)
18
13
  string << "\n}"
19
14
  end
20
15
  # dot -Tpdf test4.dot > test4.pdf
21
- def self.to_dot(entity)
16
+ def self.to_dot(entity, options)
17
+ # Id
22
18
  string = ''
23
- if entity.is_leaf?
24
- if entity.is_a?(Treat::Entities::Word)
25
- label = "label=\"#{entity.value} (#{entity.tag})\","
26
- label << "color=\"#{BorderColors[entity.cat]}\","
27
- else
28
- label = "label=\"#{entity.value.inspect[1..-2]}\","
29
- end
19
+ label = ''
20
+ string = "\n#{entity.id} ["
21
+ # Value
22
+ if entity.is_a?(Treat::Entities::Token)
23
+ label = entity.to_s
30
24
  else
31
- if entity.class < Entities::Constituent
32
- label = "label=\"#{entity.tag}\","
33
- # label << "color=\"#{BorderColors[entity.tag]}\","
34
- else
35
- label = "label=\"#{cc(cl(entity.class))}\","
25
+ label = entity.type.to_s.capitalize + " "
26
+ if entity.is_leaf?
27
+ label = entity.short_value.gsub(' [...]', " [...] \\n")
36
28
  end
37
29
  end
38
- string << "\n#{entity.id} ["
30
+ # Features
39
31
  if entity.has_features?
40
- string << label
41
- entity.features.each_pair do |feature, value|
42
- if value.is_a?(Treat::Entities::Entity)
43
- string << "#{feature}=\"#{value.id}\","
44
- else
45
- string << "#{feature}=\"#{value}\","
32
+ unless options[:features] == :none
33
+ label << "\\n"
34
+ entity.features.each do |feature, value|
35
+ if options[:features] == :all ||
36
+ options[:features].include?(feature)
37
+ if value.is_a?(Treat::Entities::Entity)
38
+ label << "\\n#{feature}=\\\"*#{value.id}\\\","
39
+ else
40
+ label << "\\n#{feature}=\\\"#{value}\\\","
41
+ end
42
+ end
46
43
  end
47
44
  end
48
- string = string[0..-2]
49
- string << "]"
50
- else
51
- string << "#{label[0..-2]}]"
52
45
  end
46
+ label = label[0..-2] if label[-1] == ','
47
+ string << "label=\"#{label}\"]"
48
+ # Parent-child relationships.
53
49
  if entity.has_parent?
54
50
  string << "\n#{entity.parent.id} -- #{entity.id};"
55
51
  end
56
- if entity.has_children?
57
- entity.each do |child|
58
- string << self.to_dot(child)
59
- end
60
- end
52
+ # Edges.
61
53
  if entity.has_edges?
62
54
  entity.edges.each_pair do |target, type|
63
55
  string << "\n#{entity.id} -- #{target}"
@@ -65,6 +57,12 @@ module Treat
65
57
  string << "arrowhead=\"odiamond\"]"
66
58
  end
67
59
  end
60
+ # Recurse.
61
+ if entity.has_children?
62
+ entity.each do |child|
63
+ string << self.to_dot(child, options)
64
+ end
65
+ end
68
66
  string
69
67
  end
70
68
  end
@@ -5,6 +5,7 @@ module Treat
5
5
  class HTML
6
6
  # Not implemented yet.
7
7
  def self.visualize(entity, options = {})
8
+ raise 'Not implemented yet.'
8
9
  end
9
10
  end
10
11
  end
@@ -1,7 +1,11 @@
1
1
  module Treat
2
2
  module Formatters
3
3
  module Visualizers
4
+ # Handles the call to inspect.
4
5
  class Inspect
6
+ # Return a terminal-friendly visualization of an entity.
7
+ #
8
+ # Options: none.
5
9
  def self.visualize(entity, options = {})
6
10
  s = "#{entity.class.to_s.split('::')[-1]} (#{entity.id.to_s})"
7
11
  unless caller_method == :inspect
@@ -2,11 +2,26 @@ module Treat
2
2
  module Formatters
3
3
  module Visualizers
4
4
  class ShortValue
5
+ # Default options for the visualizer.
6
+ DefaultOptions = { max_words: 6, max_length: 30 }
7
+ # Returns the text value of an entity, shortened
8
+ # with [...] if the value is longer than :max_words
9
+ # or longer than :max_length.
10
+ #
11
+ # Options:
12
+ # - (Integer) :max_words => the maximum number
13
+ # of words in an entity before it is shortened.
14
+ # - (Integer) :max_length => the maximum number
15
+ # of characters in an entity before it is shortened.
5
16
  def self.visualize(entity, options = {})
6
- options[:max_length] ||= 6
17
+ options = DefaultOptions.merge(options)
7
18
  words = entity.to_s.split(' ')
8
- return entity.to_s if words.size < options[:max_length]
9
- words[0..2].join(' ') + ' [...] ' + words[-3..-1].join(' ')
19
+ if words.size < options[:max_words] ||
20
+ entity.to_s.length < options[:max_length]
21
+ entity.to_s
22
+ else
23
+ words[0..2].join(' ') + ' [...] ' + words[-3..-1].join(' ')
24
+ end
10
25
  end
11
26
  end
12
27
  end
@@ -5,24 +5,29 @@ module Treat
5
5
  # an entity in standoff format; for example:
6
6
  # (S (NP John) (VP has (VP come))).
7
7
  class Standoff
8
- Recurse = Proc.new do |entity, options|
8
+ # Default options for the visualizer.
9
+ DefaultOptions = { indent: 0 }
10
+ # A lambda to recursively visualize the children
11
+ # of an entity.
12
+ Recurse = lambda do |entity, options|
9
13
  v = ''
10
14
  entity.each { |child| v += visualize(child, options) }
11
15
  v
12
16
  end
13
17
  # Visualize the entity using standoff notation.
14
- # This can only be called on sentences, as it
15
- # is not a suitable format to represent larger
16
- # entity.
18
+ # This can only be called on sentences and smaller
19
+ # entities, as it is not a suitable format to
20
+ # represent larger entities.
17
21
  def self.visualize(entity, options = {})
18
- options = {:indent => 0} if options.empty?
22
+ options = DefaultOptions.merge(options)
19
23
  value = ''; spaces = ''
20
24
  options[:indent].times { spaces << ' '}
21
25
  options[:indent] += 1
22
26
  if entity.is_a?(Treat::Entities::Token)
23
27
  value += "#{spaces}(#{entity.tag} #{entity.value})"
24
28
  elsif entity.is_a?(Treat::Entities::Constituent)
25
- value += ("#{spaces}(#{entity.tag}\n" +
29
+ tag = entity.has?(:tag) ? entity.tag : ''
30
+ value += ("#{spaces}(#{tag}\n" +
26
31
  "#{Recurse.call(entity, options)})\n")
27
32
  elsif entity.is_a?(Treat::Entities::Sentence)
28
33
  value += ("#{spaces}(S\n" +
@@ -1,11 +1,15 @@
1
1
  module Treat
2
2
  module Formatters
3
3
  module Visualizers
4
+ # This class generates an ASCII representation
5
+ # of a tree of entities.
4
6
  class Tree
7
+ # Default options for the visualizer.
8
+ DefaultOptions = { indent: 0 }
5
9
  # Obtain a plain text tree representation
6
10
  # of the entity.
7
11
  def self.visualize(entity, options = {})
8
- options = {:indent => 0} if options.empty?
12
+ options = DefaultOptions.merge(options)
9
13
  string = ''
10
14
  if entity.has_children?
11
15
  spacer = '--'
@@ -3,10 +3,15 @@ module Treat
3
3
  module Visualizers
4
4
  # Creates a plain text visualization of an entity.
5
5
  class Txt
6
+ # The default options for the visualizer.
7
+ DefaultOptions = { sep: ' ' }
6
8
  # Obtain a plain text visualization of the entity,
7
9
  # with no additional information.
10
+ #
11
+ # Options:
12
+ # (String) :sep => the separator to use between words.
8
13
  def self.visualize(entity, options = {})
9
- options[:sep] = ' '
14
+ options = DefaultOptions.merge(options)
10
15
  return entity.value if !entity.has_children?
11
16
  value = ''
12
17
  entity.each do |child|
@@ -34,7 +34,7 @@ module Treat
34
34
  # Cleaners strip a text from its mark up.
35
35
  module Cleaners
36
36
  extend Group
37
- self.type = :annotator
37
+ self.type = :transformer
38
38
  self.targets = [:document]
39
39
  self.default = :html
40
40
  end
data/lib/treat/group.rb CHANGED
@@ -61,14 +61,15 @@ module Treat
61
61
  end
62
62
  is_target
63
63
  end
64
+ # Cache the list of adaptors to improve performance.
65
+ @@list = {}
64
66
  # Populates once the list of the adaptors in the group
65
67
  # by crawling the filesystem.
66
- @@list = {}
67
68
  def list
68
69
  mod = ucc(cl(self))
69
70
  if @@list[mod].nil?
70
71
  @@list[mod] = []
71
- dirs = Dir["#{File.dirname(__FILE__)}/*/#{mod}/*.rb"] # Fix
72
+ dirs = Dir.glob("#{Treat.lib}/treat/*/#{mod}/*.rb")
72
73
  dirs.each do |file|
73
74
  @@list[mod] <<
74
75
  :"#{file.split('/')[-1][0..-4]}"
@@ -79,7 +80,7 @@ module Treat
79
80
  # Get constants in this module, excluding those
80
81
  # defined by parent modules.
81
82
  def const_get(const); super(const, false); end
82
- # Autoload the algorithms.
83
+ # Lazy load the classes in the group.
83
84
  def const_missing(const)
84
85
  bits = self.ancestors[0].to_s.split('::')
85
86
  bits.collect! { |bit| ucc(bit) }
@@ -1,43 +1,40 @@
1
1
  module Treat
2
2
  module Inflectors
3
3
  module CardinalWords
4
+ # This class is a wrapper for the functions included
5
+ # in the 'linguistics' gem that allow to describe a
6
+ # number in words in cardinal form.
7
+ #
8
+ # Project website: http://deveiate.org/projects/Linguistics/
4
9
  class Linguistics
10
+ # Require the 'linguistics' gem.
5
11
  silence_warnings { require 'linguistics' }
12
+ # Return the description of a cardinal number in words.
6
13
  #
7
14
  # Options:
8
15
  #
9
- # :group => Controls how many numbers at a time are
16
+ # - :group => Controls how many numbers at a time are
10
17
  # grouped together. Valid values are 0 (normal grouping),
11
18
  # 1 (single-digit grouping, e.g., “one, two, three, four”),
12
19
  # 2 (double-digit grouping, e.g., “twelve, thirty-four”, or
13
20
  # 3 (triple-digit grouping, e.g., “one twenty-three, four”).
14
- # :comma => Set the character/s used to separate word groups.
21
+ # - :comma => Set the character/s used to separate word groups.
15
22
  # Defaults to ", ".
16
- # :and => Set the word and/or characters used where ' and '
23
+ # - :and => Set the word and/or characters used where ' and '
17
24
  # (the default) is normally used. Setting :and to ' ', for
18
25
  # example, will cause 2556 to be returned as “two-thousand,
19
26
  # five hundred fifty-six” instead of “two-thousand, five
20
27
  # hundred and fifty-six”.
21
- # :zero => Set the word used to represent the numeral 0 in
28
+ # - :zero => Set the word used to represent the numeral 0 in
22
29
  # the result. 'zero' is the default.
23
- # :decimal => Set the translation of any decimal points in
30
+ # - :decimal => Set the translation of any decimal points in
24
31
  # the number; the default is 'point'.
25
- # :asArray If set to a true value, the number will be returned
32
+ # - :asArray If set to a true value, the number will be returned
26
33
  # as an array of word groups instead of a String.
27
34
  #
28
35
  # More specific options when using :type => :ordinal:
29
- #
30
- #
31
36
  def self.cardinal_words(entity, options = {})
32
- begin
33
- l = entity.language.to_s.upcase
34
- delegate = nil
35
- silence_warnings { delegate = ::Linguistics.const_get(l) }
36
- rescue RuntimeError
37
- raise "Ruby Linguistics does not have a module " +
38
- " installed for the #{entity.language} language."
39
- end
40
- silence_warnings { delegate.numwords(entity.to_s, options) }
37
+ silence_warnings { ::Linguistics::EN.numwords(entity.to_s, options) }
41
38
  end
42
39
  end
43
40
  end
@@ -1,15 +1,28 @@
1
1
  module Treat
2
2
  module Inflectors
3
3
  module Conjugations
4
+ # This class is a wrapper for the functions included
5
+ # in the 'linguistics' gem that allow to conjugate verbs.
6
+ #
7
+ # Project website: http://deveiate.org/projects/Linguistics/
4
8
  class Linguistics
5
9
  silence_warnings { require 'linguistics' }
6
- def self.conjugate(entity, parameters)
10
+ # Conjugate a verb using ruby linguistics with the specified
11
+ # mode, tense, count and person.
12
+ #
13
+ # Options:
14
+ #
15
+ # - (Symbol) :mode => :infinitive, :indicative, :subjunctive, :participle
16
+ # - (Symbol) :tense => :past, :present, :future
17
+ # - (Symbol) :count => :singular, :plural
18
+ # - (Symbol) :person => :first, :second, :third
19
+ def self.conjugations(entity, parameters)
7
20
  begin
8
21
  l = entity.language.to_s.upcase
9
22
  delegate = nil
10
23
  silence_warnings { delegate = ::Linguistics.const_get(l) }
11
24
  rescue RuntimeError
12
- raise "Ruby Linguistics does not have a module " +
25
+ raise "Ruby Linguistics does not have a module " +
13
26
  " installed for the #{entity.language} language."
14
27
  end
15
28
  if parameters[:mode] == :infinitive
@@ -27,4 +40,4 @@ module Treat
27
40
  end
28
41
  end
29
42
  end
30
- end
43
+ end
@@ -1,24 +1,35 @@
1
1
  module Treat
2
2
  module Inflectors
3
3
  module Declensions
4
- silence_warnings { require 'linguistics' }
5
- # Obtain word declensions in English using the
6
- # ruby 'linguistics' gem.
4
+ # This class is a wrapper for the functions included
5
+ # in the 'linguistics' gem that allow to obtain the
6
+ # declensions of a word.
7
+ #
8
+ # Project website: http://deveiate.org/projects/Linguistics/
7
9
  class Linguistics
8
- def self.declense(entity, options = {})
10
+ # Require Ruby Linguistics
11
+ silence_warnings { require 'linguistics' }
12
+ # Retrieve a declension of a word using the 'linguistics' gem.
13
+ #
14
+ # Options:
15
+ #
16
+ # - (Identifier) :count => :singular, :plural
17
+ def self.declensions(entity, options = {})
9
18
  begin
10
19
  l = entity.language.to_s.upcase
11
20
  delegate = nil
12
21
  silence_warnings { delegate = ::Linguistics.const_get(l) }
13
22
  rescue RuntimeError
14
- raise "Ruby Linguistics does not have a module " +
23
+ raise "Ruby Linguistics does not have a module " +
15
24
  " installed for the #{entity.language} language."
16
25
  end
17
26
  string = entity.to_s
18
27
  if options[:count] == :plural
19
28
  if entity.has?(:category) &&
20
29
  [:noun, :adjective, :verb].include?(entity.category)
21
- silence_warnings { delegate.send(:"plural_#{entity.category}", string) }
30
+ silence_warnings do
31
+ delegate.send(:"plural_#{entity.category}", string)
32
+ end
22
33
  else
23
34
  silence_warnings { delegate.plural(string) }
24
35
  end
@@ -1,19 +1,18 @@
1
1
  module Treat
2
2
  module Inflectors
3
3
  module OrdinalWords
4
+ # This class is a wrapper for the functions included
5
+ # in the 'linguistics' gem that allow to describe a
6
+ # number in words in ordinal form.
7
+ #
8
+ # Project website: http://deveiate.org/projects/Linguistics/
4
9
  class Linguistics
10
+ # Require Ruby Linguistics.
5
11
  silence_warnings { require 'linguistics' }
12
+ # Describe a number in words in ordinal form, using the
13
+ # 'linguistics' gem.
6
14
  def self.ordinal_words(number, options = {})
7
- begin
8
- l = number.language.to_s.upcase
9
- delegate = nil
10
- silence_warnings { delegate = ::Linguistics.const_get(l) }
11
- rescue RuntimeError
12
- lang = Treat::Languages.describe(number.language)
13
- raise "Ruby Linguistics does not have a module " +
14
- " installed for the #{lang} language."
15
- end
16
- silence_warnings { delegate.ordinate(number.to_s) }
15
+ silence_warnings { ::Linguistics::EN.ordinate(number.to_s) }
17
16
  end
18
17
  end
19
18
  end
@@ -2,16 +2,20 @@ module Treat
2
2
  module Inflectors
3
3
  module Stem
4
4
  # Stem a word using a native Ruby implementation of the
5
- # Porter stemming algorithm, ported to Ruby from the
6
- # version coded up in Perl.
5
+ # Porter stemming algorithm, ported to Ruby from a
6
+ # version coded up in Perl. This is a simplified
7
+ # implementation; for a true and fast Porter stemmer,
8
+ # see Treat::Inflectors::Stem::PorterC.
7
9
  #
8
10
  # Authored by Ray Pereda (raypereda@hotmail.com).
11
+ # Unknown license.
9
12
  #
10
13
  # Original paper: Porter, 1980. An algorithm for suffix stripping,
11
14
  # Program, Vol. 14, no. 3, pp 130-137,
12
15
  # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
13
16
  class Porter
14
17
  # Returns the stem of a word using a native Porter stemmer.
18
+ #
15
19
  # Options: none.
16
20
  def self.stem(word, options = {})
17
21
  # Copy the word and convert it to a string.
@@ -9,10 +9,13 @@ module Treat
9
9
  # Program, Vol. 14, no. 3, pp 130-137,
10
10
  # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
11
11
  class PorterC
12
+ # Require the 'ruby-stemmer' gem.
12
13
  silence_warnings { require 'lingua/stemmer' }
14
+ # Remove a conflict between this gem and the 'engtagger' gem.
13
15
  ::LinguaStemmer = ::Lingua
14
16
  Object.instance_eval { remove_const :Lingua }
15
- # Stem the word using the Porter C algorithm.
17
+ # Stem the word using a full-blown Porter stemmer in C.
18
+ #
16
19
  # Options: none.
17
20
  def self.stem(word, options = {})
18
21
  silence_warnings { ::LinguaStemmer.stemmer(word.to_s) }
@@ -9,10 +9,10 @@ module Treat
9
9
  # groups of rules: the first to clean the tokens, and
10
10
  # the second to alter suffixes."
11
11
  #
12
- # Project website: https://github.com/ealdent/uea-stemmer
13
- # Original paper: Jenkins, Marie-Claire, Smith, Dan,
14
- # Conservative stemming for search and indexing, 2005.
15
- # http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
12
+ # Project website: https://github.com/ealdent/uea-stemmer
13
+ # Original paper: Jenkins, Marie-Claire, Smith, Dan,
14
+ # Conservative stemming for search and indexing, 2005.
15
+ # http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
16
16
  class UEA
17
17
  # Require the 'uea-stemmer' gem.
18
18
  silence_warnings { require 'uea-stemmer' }
@@ -183,6 +183,22 @@ module Treat
183
183
  ['PRT', 'Particle'],
184
184
  ['S', 'Sentence']
185
185
  ]
186
+
187
+ # Maps Enju categories to Treat categories.
188
+ EnjuCatToCategory = {
189
+ 'ADJ' => :adjective,
190
+ 'ADV' => :adverb,
191
+ 'CONJ' => :conjunction,
192
+ 'COOD' => :conjunction,
193
+ 'C' => :complementizer,
194
+ 'D' => :determiner,
195
+ 'N' => :noun,
196
+ 'P' => :preposition,
197
+ 'PN' => :punctuation,
198
+ 'SC' => :conjunction,
199
+ 'V' => :verb,
200
+ 'PRT' => :particle
201
+ }
186
202
 
187
203
  # Description of the xcat in the Enju output specification.
188
204
  EnjuXCatDescription = [
@@ -1,8 +1,10 @@
1
1
  module Treat
2
2
  module Languages
3
3
  class English
4
+
4
5
  require 'treat/languages/english/tags'
5
6
  require 'treat/languages/english/categories'
7
+
6
8
  Extractors = {
7
9
  time: [:chronic],
8
10
  topics: [:reuters],
@@ -11,7 +13,7 @@ module Treat
11
13
  }
12
14
  Processors = {
13
15
  chunkers: [:txt],
14
- parsers: [:enju, :stanford],
16
+ parsers: [:stanford, :enju],
15
17
  segmenters: [:tactful, :punkt, :stanford],
16
18
  tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
17
19
  }
@@ -28,6 +30,7 @@ module Treat
28
30
  ordinal_words: [:linguistics],
29
31
  cardinal_words: [:linguistics]
30
32
  }
33
+
31
34
  end
32
35
  end
33
36
  end
@@ -4,13 +4,12 @@ module Treat
4
4
  # A class that detects the category of a word from its tag,
5
5
  # using the default tagger for the language of the entity.
6
6
  class FromTag
7
- DefaultOptions = { tagger: nil }
8
7
  # Find the category of the current entity.
8
+ #
9
9
  # Options:
10
- # :tagger => (Symbol) force the use of a tagger.
11
- # :tag_to_cat => (Hash) a list of categories for each possible tag.
10
+ #
11
+ # - (Symbol) :tagger => force the use of a tagger.
12
12
  def self.category(entity, options = {})
13
- options = DefaultOptions.merge(options)
14
13
  tag = options[:tagger].nil? ? entity.tag : entity.tag(options[:tagger])
15
14
  lang = Treat::Languages.get(entity.language)
16
15
  cat = lang::WordTagToCategory[tag]
@@ -21,6 +20,7 @@ module Treat
21
20
  if cat.size == 1
22
21
  return cat[0]
23
22
  else
23
+ entity.set :tag_set, :penn
24
24
  if entity.has?(:tag_set)
25
25
  if cat[entity.tag_set]
26
26
  return cat[entity.tag_set]
@@ -27,7 +27,7 @@ module Treat
27
27
  end
28
28
  # Return the subject of the sentence|verb.
29
29
  def self.subject(entity, options)
30
- verb = entity.category == :verb ?
30
+ verb = (entity.has?(:category) && entity.category == :verb) ?
31
31
  main_verb(entity) : entity.main_verb
32
32
  args = []
33
33
  main_verb.edges.each_pair do |id,edge|
@@ -37,7 +37,7 @@ module Treat
37
37
  end
38
38
  # Return the object of the sentence|verb.
39
39
  def self.object(entity, options)
40
- verb = entity.category == :verb ?
40
+ verb = (entity.has?(:category) && entity.category == :verb) ?
41
41
  main_verb(entity) : entity.main_verb
42
42
  if verb.voice == 'passive'
43
43
  return
@@ -50,7 +50,7 @@ module Treat
50
50
  end
51
51
  # Find the main verb (shallowest verb in the tree).
52
52
  def self.main_verb(entity, options)
53
- verbs = entity.words_with_cat(:verb)
53
+ verbs = entity.verbs
54
54
  if verbs.empty?
55
55
  return
56
56
  end
@@ -52,24 +52,16 @@ module Treat
52
52
  @@tagger = nil
53
53
  # Hold the user-set options
54
54
  @@options = {}
55
- # Hold the default options.
56
- DefaultOptions = {
57
- lexicon: nil,
58
- lexical_rules: nil,
59
- contextual_rules: nil
60
- }
61
55
  # Tag words using a native Brill tagger.
62
56
  #
63
- # Available options:
57
+ # Options:
58
+ #
64
59
  # :lexicon => String (Lexicon file to use)
65
60
  # :lexical_rules => String (Lexical rule file to use)
66
61
  # :contextual_rules => String (Contextual rules file to use)
67
62
  def self.tag(entity, options = {})
68
63
  # Reinitialize the tagger if the options have changed.
69
- if options != @@options
70
- @@options = DefaultOptions.merge(options)
71
- @@tagger = nil # Reset the tagger
72
- end
64
+ @@tagger = nil if options != @@options
73
65
  # Create the tagger if necessary
74
66
  @@tagger ||= ::Brill::Tagger.new(options[:lexicon],
75
67
  options[:lexical_rules], options[:contextual_rules])
@@ -24,9 +24,8 @@ module Treat
24
24
  @@options = {}
25
25
  # Hold the default options.
26
26
  DefaultOptions = {
27
- unknown_word_tag: '?',
28
- relax: false,
29
- debug: false
27
+ unknown_word_tag: 'FW',
28
+ relax: false
30
29
  }
31
30
  # Tag the word using a probabilistic model taking
32
31
  # into account known words found in a lexicon and
@@ -34,11 +33,10 @@ module Treat
34
33
  #
35
34
  # Options:
36
35
  #
37
- # :relax => (Boolean) Relax the Hidden Markov Model:
36
+ # - (Boolean) :relax => Relax the Hidden Markov Model:
38
37
  # this may improve accuracy for uncommon words,
39
38
  # particularly words used polysemously.
40
- # :debug => (Boolean) Print debug messages.
41
- # :unknown_word_tag => (String) Tag for unknown words.
39
+ # - (String) :unknown_word_tag => Tag for unknown words.
42
40
  def self.tag(entity, options = {})
43
41
  # Reinitialize the tagger if the options have changed.
44
42
  if options != @@options