treat 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. data/INSTALL +1 -0
  2. data/README +3 -0
  3. data/TODO +14 -26
  4. data/bin/INFO +1 -1
  5. data/lib/treat/buildable.rb +10 -11
  6. data/lib/treat/categories.rb +8 -6
  7. data/lib/treat/category.rb +7 -2
  8. data/lib/treat/delegatable.rb +64 -56
  9. data/lib/treat/detectors/encoding/r_chardet19.rb +1 -1
  10. data/lib/treat/detectors/language/language_detector.rb +2 -1
  11. data/lib/treat/detectors/language/what_language.rb +2 -2
  12. data/lib/treat/detectors.rb +3 -0
  13. data/lib/treat/entities/entity.rb +1 -1
  14. data/lib/treat/entities.rb +9 -10
  15. data/lib/treat/exception.rb +3 -1
  16. data/lib/treat/extractors/named_entity/abner.rb +1 -1
  17. data/lib/treat/extractors/named_entity/stanford.rb +2 -2
  18. data/lib/treat/extractors/time/chronic.rb +2 -2
  19. data/lib/treat/extractors/time/nickel.rb +2 -2
  20. data/lib/treat/extractors/topic_words/lda.rb +2 -2
  21. data/lib/treat/extractors.rb +12 -9
  22. data/lib/treat/feature.rb +6 -1
  23. data/lib/treat/formatters/cleaners/html.rb +1 -1
  24. data/lib/treat/formatters.rb +8 -8
  25. data/lib/treat/group.rb +11 -10
  26. data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
  27. data/lib/treat/inflectors/{conjugators → conjugations}/linguistics.rb +6 -6
  28. data/lib/treat/inflectors/{declensors → declensions}/en.rb +2 -2
  29. data/lib/treat/inflectors/{declensors → declensions}/linguistics.rb +5 -5
  30. data/lib/treat/inflectors/ordinal_words/linguistics.rb +4 -4
  31. data/lib/treat/inflectors/{stemmers → stem}/porter.rb +1 -1
  32. data/lib/treat/inflectors/{stemmers → stem}/porter_c.rb +3 -3
  33. data/lib/treat/inflectors/{stemmers → stem}/uea.rb +3 -3
  34. data/lib/treat/inflectors.rb +8 -21
  35. data/lib/treat/kernel.rb +120 -0
  36. data/lib/treat/languages/arabic.rb +14 -0
  37. data/lib/treat/languages/categories.rb +5 -0
  38. data/lib/treat/languages/chinese.rb +12 -0
  39. data/lib/treat/languages/english/categories.rb +23 -0
  40. data/lib/treat/{resources → languages/english}/tags.rb +127 -184
  41. data/lib/treat/languages/english.rb +33 -0
  42. data/lib/treat/languages/french.rb +17 -0
  43. data/lib/treat/languages/german.rb +17 -0
  44. data/lib/treat/languages/italian.rb +14 -0
  45. data/lib/treat/{resources/languages.txt → languages/list.txt} +0 -0
  46. data/lib/treat/languages/xinhua.rb +12 -0
  47. data/lib/treat/languages.rb +91 -0
  48. data/lib/treat/lexicalizers/category/from_tag.rb +20 -8
  49. data/lib/treat/lexicalizers/synsets/rita_wn.rb +1 -1
  50. data/lib/treat/lexicalizers/tag/brill.rb +2 -1
  51. data/lib/treat/lexicalizers/tag/lingua.rb +2 -1
  52. data/lib/treat/lexicalizers/tag/stanford.rb +16 -15
  53. data/lib/treat/lexicalizers.rb +1 -1
  54. data/lib/treat/object.rb +6 -0
  55. data/lib/treat/processors/parsers/enju.rb +3 -2
  56. data/lib/treat/processors/parsers/stanford.rb +15 -12
  57. data/lib/treat/processors/segmenters/punkt.rb +1 -1
  58. data/lib/treat/processors/segmenters/stanford.rb +7 -5
  59. data/lib/treat/processors/segmenters/tactful.rb +1 -1
  60. data/lib/treat/processors/tokenizers/multilingual.rb +2 -2
  61. data/lib/treat/processors/tokenizers/stanford.rb +7 -5
  62. data/lib/treat/visitable.rb +2 -1
  63. data/lib/treat.rb +105 -54
  64. data/test/tc_entity.rb +5 -0
  65. data/test/tc_resources.rb +5 -5
  66. data/test/tc_treat.rb +1 -2
  67. data/test/tests.rb +2 -1
  68. metadata +63 -64
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +0 -96
  70. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +0 -213
  71. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +0 -68
  72. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +0 -6
  73. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +0 -12
  74. data/lib/treat/resources/categories.rb +0 -18
  75. data/lib/treat/resources/delegates.rb +0 -96
  76. data/lib/treat/resources/dependencies.rb +0 -0
  77. data/lib/treat/resources/edges.rb +0 -8
  78. data/lib/treat/resources/formats.rb +0 -23
  79. data/lib/treat/resources/languages.rb +0 -86
  80. data/lib/treat/resources.rb +0 -10
  81. data/lib/treat/utilities.rb +0 -127
data/lib/treat/group.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  module Treat
2
2
  module Group
3
+ # Modify the extended class.
3
4
  def self.extended(group)
4
5
  group.module_eval do
5
6
  class << self
@@ -75,22 +76,22 @@ module Treat
75
76
  end
76
77
  @@list[mod]
77
78
  end
78
- # Set inherit to false by default.
79
- def const_get(const)
80
- super(const, false)
81
- end
79
+ # Get constants in this module, excluding those
80
+ # defined by parent modules.
81
+ def const_get(const); super(const, false); end
82
82
  # Autoload the algorithms.
83
83
  def const_missing(const)
84
84
  bits = self.ancestors[0].to_s.split('::')
85
85
  bits.collect! { |bit| ucc(bit) }
86
- file = bits.join('/') + "/#{ucc(const)}" # Fix
87
- #if not File.readable?(file + '.rb')
88
- # raise Treat::Exception,
89
- # "File '#{file}.rb' corresponding to requested delegate "+
90
- # "#{self}::#{const} does not exist."
86
+ file = bits.join('/') + "/#{ucc(const)}"
87
+ if not File.readable?("#{Treat.lib}/#{file}.rb")
88
+ raise Treat::Exception,
89
+ "File '#{file}.rb' corresponding to requested delegate "+
90
+ "#{self}::#{const} does not exist."
91
+ else
91
92
  require file
92
93
  const_get(const)
93
- #end
94
+ end
94
95
  end
95
96
  end
96
97
  end
@@ -2,7 +2,7 @@ module Treat
2
2
  module Inflectors
3
3
  module CardinalWords
4
4
  class Linguistics
5
- silently { require 'linguistics' }
5
+ silence_warnings { require 'linguistics' }
6
6
  #
7
7
  # Options:
8
8
  #
@@ -32,12 +32,12 @@ module Treat
32
32
  begin
33
33
  l = entity.language.to_s.upcase
34
34
  delegate = nil
35
- silently { delegate = ::Linguistics.const_get(l) }
35
+ silence_warnings { delegate = ::Linguistics.const_get(l) }
36
36
  rescue RuntimeError
37
37
  raise "Ruby Linguistics does not have a module " +
38
38
  " installed for the #{entity.language} language."
39
39
  end
40
- silently { delegate.numwords(entity.to_s, options) }
40
+ silence_warnings { delegate.numwords(entity.to_s, options) }
41
41
  end
42
42
  end
43
43
  end
@@ -1,23 +1,23 @@
1
1
  module Treat
2
2
  module Inflectors
3
- module Conjugators
3
+ module Conjugations
4
4
  class Linguistics
5
- silently { require 'linguistics' }
5
+ silence_warnings { require 'linguistics' }
6
6
  def self.conjugate(entity, parameters)
7
7
  begin
8
8
  l = entity.language.to_s.upcase
9
9
  delegate = nil
10
- silently { delegate = ::Linguistics.const_get(l) }
10
+ silence_warnings { delegate = ::Linguistics.const_get(l) }
11
11
  rescue RuntimeError
12
12
  raise "Ruby Linguistics does not have a module " +
13
13
  " installed for the #{entity.language} language."
14
14
  end
15
15
  if parameters[:mode] == :infinitive
16
- silently { delegate.infinitive(entity.to_s) }
16
+ silence_warnings { delegate.infinitive(entity.to_s) }
17
17
  elsif parameters[:mode] == :participle && parameters[:tense] == :present
18
- silently { delegate.present_participle(entity.to_s) }
18
+ silence_warnings { delegate.present_participle(entity.to_s) }
19
19
  elsif parameters[:count] == :plural && parameters.size == 1
20
- silently { delegate.plural_verb(entity.to_s) }
20
+ silence_warnings { delegate.plural_verb(entity.to_s) }
21
21
  else
22
22
  raise Treat::Exception,
23
23
  'This combination of modes, tenses, persons ' +
@@ -1,8 +1,8 @@
1
- silently { require 'english' }
1
+ silence_warnings { require 'english' }
2
2
 
3
3
  module Treat
4
4
  module Inflectors
5
- module Declensors
5
+ module Declensions
6
6
  module En
7
7
  def self.declense(entity, options)
8
8
  string = entity.to_s
@@ -1,7 +1,7 @@
1
1
  module Treat
2
2
  module Inflectors
3
- module Declensors
4
- silently { require 'linguistics' }
3
+ module Declensions
4
+ silence_warnings { require 'linguistics' }
5
5
  # Obtain word declensions in English using the
6
6
  # ruby 'linguistics' gem.
7
7
  class Linguistics
@@ -9,7 +9,7 @@ module Treat
9
9
  begin
10
10
  l = entity.language.to_s.upcase
11
11
  delegate = nil
12
- silently { delegate = ::Linguistics.const_get(l) }
12
+ silence_warnings { delegate = ::Linguistics.const_get(l) }
13
13
  rescue RuntimeError
14
14
  raise "Ruby Linguistics does not have a module " +
15
15
  " installed for the #{entity.language} language."
@@ -18,9 +18,9 @@ module Treat
18
18
  if options[:count] == :plural
19
19
  if entity.has?(:category) &&
20
20
  [:noun, :adjective, :verb].include?(entity.category)
21
- silently { delegate.send(:"plural_#{entity.category}", string) }
21
+ silence_warnings { delegate.send(:"plural_#{entity.category}", string) }
22
22
  else
23
- silently { delegate.plural(string) }
23
+ silence_warnings { delegate.plural(string) }
24
24
  end
25
25
  end
26
26
  end
@@ -2,18 +2,18 @@ module Treat
2
2
  module Inflectors
3
3
  module OrdinalWords
4
4
  class Linguistics
5
- silently { require 'linguistics' }
5
+ silence_warnings { require 'linguistics' }
6
6
  def self.ordinal_words(number, options = {})
7
7
  begin
8
8
  l = number.language.to_s.upcase
9
9
  delegate = nil
10
- silently { delegate = ::Linguistics.const_get(l) }
10
+ silence_warnings { delegate = ::Linguistics.const_get(l) }
11
11
  rescue RuntimeError
12
- lang = Treat::Resources::Language.describe(number.language)
12
+ lang = Treat::Languages.describe(number.language)
13
13
  raise "Ruby Linguistics does not have a module " +
14
14
  " installed for the #{lang} language."
15
15
  end
16
- silently { delegate.ordinate(number.to_s) }
16
+ silence_warnings { delegate.ordinate(number.to_s) }
17
17
  end
18
18
  end
19
19
  end
@@ -1,6 +1,6 @@
1
1
  module Treat
2
2
  module Inflectors
3
- module Stemmers
3
+ module Stem
4
4
  # Stem a word using a native Ruby implementation of the
5
5
  # Porter stemming algorithm, ported to Ruby from the
6
6
  # version coded up in Perl.
@@ -1,6 +1,6 @@
1
1
  module Treat
2
2
  module Inflectors
3
- module Stemmers
3
+ module Stem
4
4
  # Stems words using the 'ruby-stemmer' gem, which
5
5
  # wraps a C version of the Porter stemming algorithm.
6
6
  #
@@ -9,13 +9,13 @@ module Treat
9
9
  # Program, Vol. 14, no. 3, pp 130-137,
10
10
  # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
11
11
  class PorterC
12
- silently { require 'lingua/stemmer' }
12
+ silence_warnings { require 'lingua/stemmer' }
13
13
  ::LinguaStemmer = ::Lingua
14
14
  Object.instance_eval { remove_const :Lingua }
15
15
  # Stem the word using the Porter C algorithm.
16
16
  # Options: none.
17
17
  def self.stem(word, options = {})
18
- silently { ::LinguaStemmer.stemmer(word.to_s) }
18
+ silence_warnings { ::LinguaStemmer.stemmer(word.to_s) }
19
19
  end
20
20
  end
21
21
  end
@@ -1,6 +1,6 @@
1
1
  module Treat
2
2
  module Inflectors
3
- module Stemmers
3
+ module Stem
4
4
  # Stems a word using the UEA algorithm, implemented
5
5
  # by the 'uea-stemmer' gem.
6
6
  #
@@ -15,13 +15,13 @@ module Treat
15
15
  # http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
16
16
  class UEA
17
17
  # Require the 'uea-stemmer' gem.
18
- silently { require 'uea-stemmer' }
18
+ silence_warnings { require 'uea-stemmer' }
19
19
  # Keep only one copy of the stemmer.
20
20
  @@stemmer = nil
21
21
  # Stems a word using the UEA algorithm, implemented
22
22
  # by the 'uea-stemmer' gem.
23
23
  def self.stem(entity, options = {})
24
- @@stemmer ||= silently { ::UEAStemmer.new }
24
+ @@stemmer ||= silence_warnings { ::UEAStemmer.new }
25
25
  @@stemmer.stem(entity.to_s).strip
26
26
  end
27
27
  end
@@ -1,44 +1,31 @@
1
1
  module Treat
2
2
  # Algorithms to retrieve the inflections of a word.
3
- # Stemmers return the stem (not root form) of a word.
4
- # Taggers return the part of speech tag of a word.
5
- # Inflectors allow to retrieve the different inflections of a
6
- # noun (declensions), a verb (conjugations). Lexicons return,
7
- # among other things, the gloss or synset of a word.
8
3
  module Inflectors
9
- # Lemmatizers return the root form of a word.
10
- module Lemmatizers
4
+ # Return the stem (*not root form*) of a word.
5
+ module Stem
11
6
  extend Group
12
7
  self.type = :annotator
13
8
  self.targets = [:word]
14
9
  end
15
- # Stemmers return the stem (*not root form*) of a word.
16
- module Stemmers
10
+ # Retrieve the different declensions of a noun (singular, plural).
11
+ module Declensions
17
12
  extend Group
18
13
  self.type = :annotator
19
14
  self.targets = [:word]
20
15
  end
21
- # Declensors allow to retrieve the different declensions of a
22
- # noun (singular, plural).
23
- module Declensors
16
+ # Retrieve the different conjugations of a word.
17
+ module Conjugations
24
18
  extend Group
25
19
  self.type = :annotator
26
20
  self.targets = [:word]
27
21
  end
28
- # Conjugators allow to retrieve the different conjugations of
29
- # a word.
30
- module Conjugators
31
- extend Group
32
- self.type = :annotator
33
- self.targets = [:word]
34
- end
35
- # Cardinal retrieve the full text description of a number.
22
+ # Retrieve the full text description of a cardinal number.
36
23
  module CardinalWords
37
24
  extend Group
38
25
  self.type = :annotator
39
26
  self.targets = [:number]
40
27
  end
41
- # Ordinal retrieve the ordinal form of numbers.
28
+ # Retrieve the full text description of an ordinal number.
42
29
  module OrdinalWords
43
30
  extend Group
44
31
  self.type = :annotator
@@ -0,0 +1,120 @@
1
# Extends the core Kernel module with small utility helpers that are
# callable from anywhere in the library: name conversion (camel case
# and back), warning suppression, temporary files, "did you mean"
# suggestions, and caller introspection.
module Kernel
  require 'fileutils'
  require 'tempfile'

  # Acronyms used in class names, joined with '|' so the string can be
  # interpolated into a regular expression. These do not CamelCase;
  # they CAMELCase (e.g. :xml <-> XML).
  Acronyms = ['XML', 'HTML', 'YAML', 'UEA', 'LDA', 'PDF', 'GOCR'].join('|')

  # Memoized results of camel_case, keyed by the original argument.
  @@cc_cache = {}
  # Memoized results of un_camel_case, keyed by the original argument.
  @@ucc_cache = {}

  # Returns the second dash-separated component of RUBY_PLATFORM
  # (nil when RUBY_PLATFORM contains no dash).
  def platform
    RUBY_PLATFORM.split("-")[1]
  end

  # Runs the block with $VERBOSE set to nil and returns the block's
  # value. The previous $VERBOSE is restored even when the block
  # raises, so a failing block cannot leave warnings switched off.
  def silence_warnings(&block)
    warn_level = $VERBOSE
    $VERBOSE = nil
    block.call
  ensure
    $VERBOSE = warn_level
  end

  # Placeholder for running a block while blocking output streams.
  # Currently not implemented: the +streams+ arguments are ignored and
  # the block is simply yielded to.
  def silence_streams(*streams)
    yield
  end

  # Creates a temporary file with extension +ext+ inside Treat.tmp,
  # optionally writes +value+ to it, and calls the block with the
  # file's path. Returns the block's value. The file is closed and
  # deleted once the block has finished, whether or not it raised.
  def create_temp_file(ext, value = nil, &block)
    tmp = Tempfile.new(['', ".#{ext}"], Treat.tmp)
    begin
      if value
        tmp.puts(value)
        # Flush so that code opening the path sees the content.
        tmp.flush
      end
      block.call(tmp.path)
    ensure
      tmp.close!
    end
  end

  # Converts un_camel_case to CamelCase. Acronyms listed in Acronyms
  # are upcased as a whole when they form a complete underscore-
  # delimited word (:xml => "XML", :gocr => "GOCR").
  # Returns a new String on every call.
  def camel_case(o_phrase)
    cached = @@cc_cache[o_phrase]
    # Hand out a copy so callers cannot mutate the cached value.
    return cached.dup if cached
    phrase = o_phrase.to_s.dup
    # The alternation is grouped so the word-boundary checks apply to
    # every acronym, not only to the last one in the list.
    phrase.gsub!(/(?<![a-z])(?:#{Acronyms.downcase})(?![a-z])/) { |a| a.upcase }
    phrase.gsub!(/^[a-z]|_[a-z]/) { |a| a.upcase }
    phrase.gsub!('_', '')
    @@cc_cache[o_phrase] = phrase.dup.freeze
    phrase
  end
  alias :cc :camel_case

  # Converts CamelCase to un_camel_case. Acronyms listed in Acronyms
  # are treated as a single word ("XMLParser" => "xml_parser").
  # Returns a new String on every call.
  def un_camel_case(o_phrase)
    cached = @@ucc_cache[o_phrase]
    # Hand out a copy so callers cannot mutate the cached value.
    return cached.dup if cached
    phrase = o_phrase.to_s.dup
    phrase.gsub!(/#{Acronyms}/) { |a| a.downcase.capitalize }
    phrase.gsub!(/[A-Z]/) { |p| '_' + p.downcase }
    # A leading capital produced a leading underscore; drop it.
    phrase = phrase[1..-1] if phrase.start_with?('_')
    @@ucc_cache[o_phrase] = phrase.dup.freeze
    phrase
  end
  alias :ucc :un_camel_case

  # Retrieve the Class from a Module::Class, i.e. the last
  # '::'-separated component of n.to_s.
  def class_name(n); n.to_s.split('::').last; end
  alias :cl :class_name

  # Searches +list+ for elements at Levenshtein distance 1 from +name+.
  # Returns a " Perhaps you meant ... ?" string naming them, or an
  # empty string when there are none. Elements may be Strings or
  # Symbols; nil elements are ignored.
  def did_you_mean?(list, name)
    sugg = []
    list.each do |element|
      # levenshtein returns nil for nil input; nil never equals 1.
      sugg << element if levenshtein(element, name) == 1
    end
    return '' if sugg.empty?
    return " Perhaps you meant '#{sugg[0]}' ?" if sugg.size == 1
    sugg_quote = sugg[0..-2].map { |x| "'#{x}'" }
    " Perhaps you meant #{sugg_quote.join(', ')}," +
    " or '#{sugg[-1]}' ?"
  end
  alias :dym? :did_you_mean?

  # Returns, as a Symbol, the method label found in the n-th entry of
  # the call stack as seen from this method (n = 3 by default).
  # Returns :"" when the stack is shallower than n or the entry
  # carries no method label.
  def caller_method(n = 3)
    # caller must be invoked directly in this method body: wrapping it
    # in a block or helper would shift the frame numbering.
    at = (caller(n) || []).first
    # Backtrace entries look like "file:line:in `name'"; newer Rubies
    # open the label with a plain quote instead of a backtick.
    name = at && at[/\A.+?:\d+:in [`'](.*)'/, 1]
    :"#{name}"
  end
  alias :cm :caller_method

  # Returns the Levenshtein distance between two strings, taking into
  # account the costs of insertion, deletion and substitution.
  # Returns nil when either argument is nil. Arguments are compared
  # through to_s, so Symbols are accepted. Used by did_you_mean?
  def levenshtein(first, other, ins=1, del=1, sub=1)
    return nil if first.nil? || other.nil?
    a = first.to_s
    b = other.to_s
    # previous[j] is the cost of turning the empty prefix of +other+
    # into the first j characters of +first+.
    previous = (0..a.length).map { |j| j * ins }
    (1..b.length).each do |i|
      current = [i * del]
      (1..a.length).each do |j|
        cost = a[j - 1] == b[i - 1] ? 0 : sub
        current[j] = [
          previous[j - 1] + cost,
          current[j - 1] + ins,
          previous[j] + del
        ].min
      end
      previous = current
    end
    previous[a.length]
  end
end
@@ -0,0 +1,14 @@
1
# Per-group settings for the Arabic language.
# NOTE(review): the hashes presumably list which implementations are
# available for Arabic in each group - confirm against the code that
# reads these constants.
module Treat
  module Languages
    class Arabic
      # Nothing is listed for extractors or inflectors.
      Extractors = {}
      Inflectors = {}
      # :tag is mapped to [:stanford].
      Lexicalizers = { :tag => [:stanford] }
      # :parsers is mapped to [:stanford].
      Processors = { :parsers => [:stanford] }
    end
  end
end
@@ -0,0 +1,5 @@
1
# Opens the Treat::Languages namespace without adding anything to it.
module Treat
  module Languages; end
end
@@ -0,0 +1,12 @@
1
# Per-group settings for the Chinese language.
# NOTE(review): the hashes presumably list which implementations are
# available for Chinese in each group - confirm against the code that
# reads these constants.
module Treat
  module Languages
    class Chinese
      # Nothing is listed for extractors, inflectors or processors.
      Extractors = {}
      Inflectors = {}
      # :tag is mapped to [:stanford].
      Lexicalizers = { :tag => [:stanford] }
      Processors = {}
    end
  end
end
@@ -0,0 +1,23 @@
1
# English word categories, plus a lookup table from word tag to
# category built from Treat::Languages::English::AlignedWordTags
# (which must already be defined when this file is loaded).
module Treat
  module Languages
    class English
      # A list of all possible word categories.
      Categories = [
        :adjective, :adverb, :noun, :verb,
        :interjection, :clitic, :coverb, :conjunction,
        :determiner, :particle, :preposition, :pronoun,
        :number, :symbol, :punctuation, :complementizer
      ]

      # AlignedWordTags is consumed two entries at a time: a
      # description followed by its tags. The tags at positions 0, 1
      # and 2 are filed under :claws_5, :brown and :penn respectively.
      tag_sets = [:claws_5, :brown, :penn]
      lookup = {}
      Treat::Languages::English::AlignedWordTags.each_slice(2) do |description, tags|
        # The category is the first word of the description, with any
        # comma split off first, downcased and turned into a symbol.
        category = description.gsub(',', ' ,').split(' ').first.downcase.intern
        tag_sets.each_with_index do |tag_set, position|
          (lookup[tags[position]] ||= {})[tag_set] = category
        end
      end

      # A hash converting word tags to word categories:
      # tag => { tag set => category }.
      WordTagToCategory = lookup
    end
  end
end