treat 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (81) hide show
  1. data/INSTALL +1 -0
  2. data/README +3 -0
  3. data/TODO +14 -26
  4. data/bin/INFO +1 -1
  5. data/lib/treat/buildable.rb +10 -11
  6. data/lib/treat/categories.rb +8 -6
  7. data/lib/treat/category.rb +7 -2
  8. data/lib/treat/delegatable.rb +64 -56
  9. data/lib/treat/detectors/encoding/r_chardet19.rb +1 -1
  10. data/lib/treat/detectors/language/language_detector.rb +2 -1
  11. data/lib/treat/detectors/language/what_language.rb +2 -2
  12. data/lib/treat/detectors.rb +3 -0
  13. data/lib/treat/entities/entity.rb +1 -1
  14. data/lib/treat/entities.rb +9 -10
  15. data/lib/treat/exception.rb +3 -1
  16. data/lib/treat/extractors/named_entity/abner.rb +1 -1
  17. data/lib/treat/extractors/named_entity/stanford.rb +2 -2
  18. data/lib/treat/extractors/time/chronic.rb +2 -2
  19. data/lib/treat/extractors/time/nickel.rb +2 -2
  20. data/lib/treat/extractors/topic_words/lda.rb +2 -2
  21. data/lib/treat/extractors.rb +12 -9
  22. data/lib/treat/feature.rb +6 -1
  23. data/lib/treat/formatters/cleaners/html.rb +1 -1
  24. data/lib/treat/formatters.rb +8 -8
  25. data/lib/treat/group.rb +11 -10
  26. data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
  27. data/lib/treat/inflectors/{conjugators → conjugations}/linguistics.rb +6 -6
  28. data/lib/treat/inflectors/{declensors → declensions}/en.rb +2 -2
  29. data/lib/treat/inflectors/{declensors → declensions}/linguistics.rb +5 -5
  30. data/lib/treat/inflectors/ordinal_words/linguistics.rb +4 -4
  31. data/lib/treat/inflectors/{stemmers → stem}/porter.rb +1 -1
  32. data/lib/treat/inflectors/{stemmers → stem}/porter_c.rb +3 -3
  33. data/lib/treat/inflectors/{stemmers → stem}/uea.rb +3 -3
  34. data/lib/treat/inflectors.rb +8 -21
  35. data/lib/treat/kernel.rb +120 -0
  36. data/lib/treat/languages/arabic.rb +14 -0
  37. data/lib/treat/languages/categories.rb +5 -0
  38. data/lib/treat/languages/chinese.rb +12 -0
  39. data/lib/treat/languages/english/categories.rb +23 -0
  40. data/lib/treat/{resources → languages/english}/tags.rb +127 -184
  41. data/lib/treat/languages/english.rb +33 -0
  42. data/lib/treat/languages/french.rb +17 -0
  43. data/lib/treat/languages/german.rb +17 -0
  44. data/lib/treat/languages/italian.rb +14 -0
  45. data/lib/treat/{resources/languages.txt → languages/list.txt} +0 -0
  46. data/lib/treat/languages/xinhua.rb +12 -0
  47. data/lib/treat/languages.rb +91 -0
  48. data/lib/treat/lexicalizers/category/from_tag.rb +20 -8
  49. data/lib/treat/lexicalizers/synsets/rita_wn.rb +1 -1
  50. data/lib/treat/lexicalizers/tag/brill.rb +2 -1
  51. data/lib/treat/lexicalizers/tag/lingua.rb +2 -1
  52. data/lib/treat/lexicalizers/tag/stanford.rb +16 -15
  53. data/lib/treat/lexicalizers.rb +1 -1
  54. data/lib/treat/object.rb +6 -0
  55. data/lib/treat/processors/parsers/enju.rb +3 -2
  56. data/lib/treat/processors/parsers/stanford.rb +15 -12
  57. data/lib/treat/processors/segmenters/punkt.rb +1 -1
  58. data/lib/treat/processors/segmenters/stanford.rb +7 -5
  59. data/lib/treat/processors/segmenters/tactful.rb +1 -1
  60. data/lib/treat/processors/tokenizers/multilingual.rb +2 -2
  61. data/lib/treat/processors/tokenizers/stanford.rb +7 -5
  62. data/lib/treat/visitable.rb +2 -1
  63. data/lib/treat.rb +105 -54
  64. data/test/tc_entity.rb +5 -0
  65. data/test/tc_resources.rb +5 -5
  66. data/test/tc_treat.rb +1 -2
  67. data/test/tests.rb +2 -1
  68. metadata +63 -64
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +0 -96
  70. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +0 -213
  71. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +0 -68
  72. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +0 -6
  73. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +0 -12
  74. data/lib/treat/resources/categories.rb +0 -18
  75. data/lib/treat/resources/delegates.rb +0 -96
  76. data/lib/treat/resources/dependencies.rb +0 -0
  77. data/lib/treat/resources/edges.rb +0 -8
  78. data/lib/treat/resources/formats.rb +0 -23
  79. data/lib/treat/resources/languages.rb +0 -86
  80. data/lib/treat/resources.rb +0 -10
  81. data/lib/treat/utilities.rb +0 -127
data/lib/treat/group.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  module Treat
2
2
  module Group
3
+ # Modify the extended class.
3
4
  def self.extended(group)
4
5
  group.module_eval do
5
6
  class << self
@@ -75,22 +76,22 @@ module Treat
75
76
  end
76
77
  @@list[mod]
77
78
  end
78
- # Set inherit to false by default.
79
- def const_get(const)
80
- super(const, false)
81
- end
79
+ # Get constants in this module, excluding those
80
+ # defined by parent modules.
81
+ def const_get(const); super(const, false); end
82
82
  # Autoload the algorithms.
83
83
  def const_missing(const)
84
84
  bits = self.ancestors[0].to_s.split('::')
85
85
  bits.collect! { |bit| ucc(bit) }
86
- file = bits.join('/') + "/#{ucc(const)}" # Fix
87
- #if not File.readable?(file + '.rb')
88
- # raise Treat::Exception,
89
- # "File '#{file}.rb' corresponding to requested delegate "+
90
- # "#{self}::#{const} does not exist."
86
+ file = bits.join('/') + "/#{ucc(const)}"
87
+ if not File.readable?("#{Treat.lib}/#{file}.rb")
88
+ raise Treat::Exception,
89
+ "File '#{file}.rb' corresponding to requested delegate "+
90
+ "#{self}::#{const} does not exist."
91
+ else
91
92
  require file
92
93
  const_get(const)
93
- #end
94
+ end
94
95
  end
95
96
  end
96
97
  end
@@ -2,7 +2,7 @@ module Treat
2
2
  module Inflectors
3
3
  module CardinalWords
4
4
  class Linguistics
5
- silently { require 'linguistics' }
5
+ silence_warnings { require 'linguistics' }
6
6
  #
7
7
  # Options:
8
8
  #
@@ -32,12 +32,12 @@ module Treat
32
32
  begin
33
33
  l = entity.language.to_s.upcase
34
34
  delegate = nil
35
- silently { delegate = ::Linguistics.const_get(l) }
35
+ silence_warnings { delegate = ::Linguistics.const_get(l) }
36
36
  rescue RuntimeError
37
37
  raise "Ruby Linguistics does not have a module " +
38
38
  " installed for the #{entity.language} language."
39
39
  end
40
- silently { delegate.numwords(entity.to_s, options) }
40
+ silence_warnings { delegate.numwords(entity.to_s, options) }
41
41
  end
42
42
  end
43
43
  end
@@ -1,23 +1,23 @@
1
1
  module Treat
2
2
  module Inflectors
3
- module Conjugators
3
+ module Conjugations
4
4
  class Linguistics
5
- silently { require 'linguistics' }
5
+ silence_warnings { require 'linguistics' }
6
6
  def self.conjugate(entity, parameters)
7
7
  begin
8
8
  l = entity.language.to_s.upcase
9
9
  delegate = nil
10
- silently { delegate = ::Linguistics.const_get(l) }
10
+ silence_warnings { delegate = ::Linguistics.const_get(l) }
11
11
  rescue RuntimeError
12
12
  raise "Ruby Linguistics does not have a module " +
13
13
  " installed for the #{entity.language} language."
14
14
  end
15
15
  if parameters[:mode] == :infinitive
16
- silently { delegate.infinitive(entity.to_s) }
16
+ silence_warnings { delegate.infinitive(entity.to_s) }
17
17
  elsif parameters[:mode] == :participle && parameters[:tense] == :present
18
- silently { delegate.present_participle(entity.to_s) }
18
+ silence_warnings { delegate.present_participle(entity.to_s) }
19
19
  elsif parameters[:count] == :plural && parameters.size == 1
20
- silently { delegate.plural_verb(entity.to_s) }
20
+ silence_warnings { delegate.plural_verb(entity.to_s) }
21
21
  else
22
22
  raise Treat::Exception,
23
23
  'This combination of modes, tenses, persons ' +
@@ -1,8 +1,8 @@
1
- silently { require 'english' }
1
+ silence_warnings { require 'english' }
2
2
 
3
3
  module Treat
4
4
  module Inflectors
5
- module Declensors
5
+ module Declensions
6
6
  module En
7
7
  def self.declense(entity, options)
8
8
  string = entity.to_s
@@ -1,7 +1,7 @@
1
1
  module Treat
2
2
  module Inflectors
3
- module Declensors
4
- silently { require 'linguistics' }
3
+ module Declensions
4
+ silence_warnings { require 'linguistics' }
5
5
  # Obtain word declensions in English using the
6
6
  # ruby 'linguistics' gem.
7
7
  class Linguistics
@@ -9,7 +9,7 @@ module Treat
9
9
  begin
10
10
  l = entity.language.to_s.upcase
11
11
  delegate = nil
12
- silently { delegate = ::Linguistics.const_get(l) }
12
+ silence_warnings { delegate = ::Linguistics.const_get(l) }
13
13
  rescue RuntimeError
14
14
  raise "Ruby Linguistics does not have a module " +
15
15
  " installed for the #{entity.language} language."
@@ -18,9 +18,9 @@ module Treat
18
18
  if options[:count] == :plural
19
19
  if entity.has?(:category) &&
20
20
  [:noun, :adjective, :verb].include?(entity.category)
21
- silently { delegate.send(:"plural_#{entity.category}", string) }
21
+ silence_warnings { delegate.send(:"plural_#{entity.category}", string) }
22
22
  else
23
- silently { delegate.plural(string) }
23
+ silence_warnings { delegate.plural(string) }
24
24
  end
25
25
  end
26
26
  end
@@ -2,18 +2,18 @@ module Treat
2
2
  module Inflectors
3
3
  module OrdinalWords
4
4
  class Linguistics
5
- silently { require 'linguistics' }
5
+ silence_warnings { require 'linguistics' }
6
6
  def self.ordinal_words(number, options = {})
7
7
  begin
8
8
  l = number.language.to_s.upcase
9
9
  delegate = nil
10
- silently { delegate = ::Linguistics.const_get(l) }
10
+ silence_warnings { delegate = ::Linguistics.const_get(l) }
11
11
  rescue RuntimeError
12
- lang = Treat::Resources::Language.describe(number.language)
12
+ lang = Treat::Languages.describe(number.language)
13
13
  raise "Ruby Linguistics does not have a module " +
14
14
  " installed for the #{lang} language."
15
15
  end
16
- silently { delegate.ordinate(number.to_s) }
16
+ silence_warnings { delegate.ordinate(number.to_s) }
17
17
  end
18
18
  end
19
19
  end
@@ -1,6 +1,6 @@
1
1
  module Treat
2
2
  module Inflectors
3
- module Stemmers
3
+ module Stem
4
4
  # Stem a word using a native Ruby implementation of the
5
5
  # Porter stemming algorithm, ported to Ruby from the
6
6
  # version coded up in Perl.
@@ -1,6 +1,6 @@
1
1
  module Treat
2
2
  module Inflectors
3
- module Stemmers
3
+ module Stem
4
4
  # Stems words using the 'ruby-stemmer' gem, which
5
5
  # wraps a C version of the Porter stemming algorithm.
6
6
  #
@@ -9,13 +9,13 @@ module Treat
9
9
  # Program, Vol. 14, no. 3, pp 130-137,
10
10
  # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
11
11
  class PorterC
12
- silently { require 'lingua/stemmer' }
12
+ silence_warnings { require 'lingua/stemmer' }
13
13
  ::LinguaStemmer = ::Lingua
14
14
  Object.instance_eval { remove_const :Lingua }
15
15
  # Stem the word using the Porter C algorithm.
16
16
  # Options: none.
17
17
  def self.stem(word, options = {})
18
- silently { ::LinguaStemmer.stemmer(word.to_s) }
18
+ silence_warnings { ::LinguaStemmer.stemmer(word.to_s) }
19
19
  end
20
20
  end
21
21
  end
@@ -1,6 +1,6 @@
1
1
  module Treat
2
2
  module Inflectors
3
- module Stemmers
3
+ module Stem
4
4
  # Stems a word using the UEA algorithm, implemented
5
5
  # by the 'uea-stemmer' gem.
6
6
  #
@@ -15,13 +15,13 @@ module Treat
15
15
  # http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
16
16
  class UEA
17
17
  # Require the 'uea-stemmer' gem.
18
- silently { require 'uea-stemmer' }
18
+ silence_warnings { require 'uea-stemmer' }
19
19
  # Keep only one copy of the stemmer.
20
20
  @@stemmer = nil
21
21
  # Stems a word using the UEA algorithm, implemented
22
22
  # by the 'uea-stemmer' gem.
23
23
  def self.stem(entity, options = {})
24
- @@stemmer ||= silently { ::UEAStemmer.new }
24
+ @@stemmer ||= silence_warnings { ::UEAStemmer.new }
25
25
  @@stemmer.stem(entity.to_s).strip
26
26
  end
27
27
  end
@@ -1,44 +1,31 @@
1
1
  module Treat
2
2
  # Algorithms to retrieve the inflections of a word.
3
- # Stemmers return the stem (not root form) of a word.
4
- # Taggers return the part of speech tag of a word.
5
- # Inflectors allow to retrieve the different inflections of a
6
- # noun (declensions), a verb (conjugations). Lexicons return,
7
- # among other things, the gloss or synset of a word.
8
3
  module Inflectors
9
- # Lemmatizers return the root form of a word.
10
- module Lemmatizers
4
+ # Return the stem (*not root form*) of a word.
5
+ module Stem
11
6
  extend Group
12
7
  self.type = :annotator
13
8
  self.targets = [:word]
14
9
  end
15
- # Stemmers return the stem (*not root form*) of a word.
16
- module Stemmers
10
+ # Retrieve the different declensions of a noun (singular, plural).
11
+ module Declensions
17
12
  extend Group
18
13
  self.type = :annotator
19
14
  self.targets = [:word]
20
15
  end
21
- # Declensors allow to retrieve the different declensions of a
22
- # noun (singular, plural).
23
- module Declensors
16
+ # Retrieve the different conjugations of a word.
17
+ module Conjugations
24
18
  extend Group
25
19
  self.type = :annotator
26
20
  self.targets = [:word]
27
21
  end
28
- # Conjugators allow to retrieve the different conjugations of
29
- # a word.
30
- module Conjugators
31
- extend Group
32
- self.type = :annotator
33
- self.targets = [:word]
34
- end
35
- # Cardinal retrieve the full text description of a number.
22
+ # Retrieve the full text description of a cardinal number.
36
23
  module CardinalWords
37
24
  extend Group
38
25
  self.type = :annotator
39
26
  self.targets = [:number]
40
27
  end
41
- # Ordinal retrieve the ordinal form of numbers.
28
+ # Retrieve the full text description of an ordinal number.
42
29
  module OrdinalWords
43
30
  extend Group
44
31
  self.type = :annotator
@@ -0,0 +1,120 @@
1
+ # Extends the core Kernel module to provide
2
+ # easy access to utility functions used across
3
+ # the library.
4
+ module Kernel
5
+ require 'fileutils'
6
+ require 'tempfile'
7
+ # A list of acronyms used in class names within
8
+ # the program. These do not CamelCase; they
9
+ # CAMELCase.
10
+ Acronyms = ['XML', 'HTML', 'YAML', 'UEA', 'LDA', 'PDF', 'GOCR'].join('|')
11
+ # A cache to optimize camel casing.
12
+ @@cc_cache = {}
13
+ # A cache to optimize un camel casing.
14
+ @@ucc_cache = {}
15
+ # Returns the platform we are running on.
16
+ def platform
17
+ RUBY_PLATFORM.split("-")[1]
18
+ end
19
+ # Runs a block of code without warnings.
20
+ def silence_warnings(&block)
21
+ warn_level = $VERBOSE
22
+ $VERBOSE = nil
23
+ result = block.call
24
+ $VERBOSE = warn_level
25
+ result
26
+ end
27
+ # Runs a block of code while blocking
28
+ # stdout. Currently not implemented.
29
+ def silence_streams(*streams)
30
+ yield
31
+ end
32
+ # Create a temporary file which is deleted
33
+ # after execution of the block.
34
+ def create_temp_file(ext, value = nil, &block)
35
+ tmp = Tempfile.new(['', ".#{ext.to_s}"], Treat.tmp)
36
+ tmp.puts(value) if value
37
+ block.call(tmp.path)
38
+ end
39
+ # Convert un_camel_case to CamelCase.
40
+ def camel_case(o_phrase)
41
+ phrase = o_phrase.to_s.dup
42
+ return @@cc_cache[o_phrase] if @@cc_cache[o_phrase]
43
+ phrase.gsub!(/#{Acronyms.downcase}[^a-z]+/) { |a| a.upcase }
44
+ phrase.gsub!(/^[a-z]|_[a-z]/) { |a| a.upcase }
45
+ phrase.gsub!('_', '')
46
+ @@cc_cache[o_phrase] = phrase
47
+ phrase
48
+ end
49
+ alias :cc :camel_case
50
+ # Convert CamelCase to un_camel_case.
51
+ def un_camel_case(o_phrase)
52
+ phrase = o_phrase.to_s.dup
53
+ return @@ucc_cache[o_phrase] if @@ucc_cache[o_phrase]
54
+ phrase.gsub!(/#{Acronyms}/) { |a| a.downcase.capitalize }
55
+ phrase.gsub!(/[A-Z]/) { |p| '_' + p.downcase }
56
+ phrase = phrase[1..-1] if phrase[0] == '_'
57
+ @@ucc_cache[o_phrase] = phrase
58
+ phrase
59
+ end
60
+ alias :ucc :un_camel_case
61
+ # Retrieve the Class from a Module::Class.
62
+ def class_name(n); n.to_s.split('::')[-1]; end
63
+ alias :cl :class_name
64
+ # Search the list to see if there are words similar to #name
65
+ # in the #list. If yes, return a string saying "Did you mean
66
+ # ... ?" with the names.
67
+ def did_you_mean?(list, name)
68
+ msg = ''
69
+ sugg = []
70
+ list.each do |element|
71
+ l = levenshtein(element,name)
72
+ if l > 0 && l < 2
73
+ sugg << element
74
+ end
75
+ end
76
+ unless sugg.empty?
77
+ if sugg.size == 1
78
+ msg += " Perhaps you meant '#{sugg[0]}' ?"
79
+ else
80
+ sugg_quote = sugg[0..-2].map {|x| '\'' + x + '\''}
81
+ msg += " Perhaps you meant #{sugg_quote.join(', ')}," +
82
+ " or '#{sugg[-1]}' ?"
83
+ end
84
+ end
85
+ msg
86
+ end
87
+ alias :dym? :did_you_mean?
88
+ # Return the name of the method that called the method
89
+ # that calls this method.
90
+ def caller_method(n = 3)
91
+ at = caller(n).first
92
+ /^(.+?):(\d+)(?::in `(.*)')?/ =~ at
93
+ :"#{Regexp.last_match[3]}"
94
+ end
95
+ alias :cm :caller_method
96
+ # Return the Levenshtein distance between two strings,
97
+ # taking into account the costs of insertion, deletion,
98
+ # and substitution. Stolen from:
99
+ # http://ruby-snippets.heroku.com/string/levenshtein-distance
100
+ # Used by did_you_mean?
101
+ def levenshtein(first, other, ins=1, del=1, sub=1)
102
+ return nil if first.nil? || other.nil?
103
+ dm = []
104
+ dm[0] = (0..first.length).collect { |i| i * ins}
105
+ fill = [0] * (first.length - 1)
106
+ for i in 1..other.length
107
+ dm[i] = [i * del, fill.flatten]
108
+ end
109
+ for i in 1..other.length
110
+ for j in 1..first.length
111
+ dm[i][j] = [
112
+ dm[i-1][j-1] + (first[i-1] == other[i-1] ? 0 : sub),
113
+ dm[i][j-1] + ins,
114
+ dm[i-1][j] + del
115
+ ].min
116
+ end
117
+ end
118
+ dm[other.length][first.length]
119
+ end
120
+ end
@@ -0,0 +1,14 @@
1
+ module Treat
2
+ module Languages
3
+ class Arabic
4
+ Extractors = {}
5
+ Inflectors = {}
6
+ Lexicalizers = {
7
+ tag: [:stanford]
8
+ }
9
+ Processors = {
10
+ parsers: [:stanford]
11
+ }
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,5 @@
1
+ module Treat
2
+ module Languages
3
+
4
+ end
5
+ end
@@ -0,0 +1,12 @@
1
+ module Treat
2
+ module Languages
3
+ class Chinese
4
+ Extractors = {}
5
+ Inflectors = {}
6
+ Lexicalizers = {
7
+ tag: [:stanford]
8
+ }
9
+ Processors = {}
10
+ end
11
+ end
12
+ end
@@ -0,0 +1,23 @@
1
+ module Treat
2
+ module Languages
3
+ class English
4
+ # A list of all possible word categories.
5
+ Categories = [
6
+ :adjective, :adverb, :noun, :verb, :interjection,
7
+ :clitic, :coverb, :conjunction, :determiner, :particle,
8
+ :preposition, :pronoun, :number, :symbol, :punctuation,
9
+ :complementizer
10
+ ]
11
+ wttc = {}
12
+ Treat::Languages::English::AlignedWordTags.each_slice(2) do |desc, tags|
13
+ category = desc.gsub(',', ' ,').split(' ')[0].downcase.intern
14
+ wttc[tags[0]] ||= {}; wttc[tags[1]] ||= {} ;wttc[tags[2]] ||= {}
15
+ wttc[tags[0]][:claws_5] = category
16
+ wttc[tags[1]][:brown] = category
17
+ wttc[tags[2]][:penn] = category
18
+ end
19
+ # A hash converting word tags to word categories.
20
+ WordTagToCategory = wttc
21
+ end
22
+ end
23
+ end