treat 0.1.4 → 0.2.0

Files changed (160)
  1. data/LICENSE +4 -4
  2. data/TODO +21 -54
  3. data/lib/economist/half_cocked_basel.txt +16 -0
  4. data/lib/economist/hose_and_dry.doc +0 -0
  5. data/lib/economist/hungarys_troubles.abw +70 -0
  6. data/lib/economist/republican_nomination.pdf +0 -0
  7. data/lib/economist/saving_the_euro.odt +0 -0
  8. data/lib/economist/to_infinity_and_beyond.txt +15 -0
  9. data/lib/economist/zero_sum.html +91 -0
  10. data/lib/treat.rb +58 -72
  11. data/lib/treat/buildable.rb +59 -15
  12. data/lib/treat/categories.rb +26 -14
  13. data/lib/treat/category.rb +2 -2
  14. data/lib/treat/delegatable.rb +65 -48
  15. data/lib/treat/doable.rb +44 -0
  16. data/lib/treat/entities.rb +34 -14
  17. data/lib/treat/entities/collection.rb +2 -0
  18. data/lib/treat/entities/document.rb +3 -2
  19. data/lib/treat/entities/entity.rb +105 -90
  20. data/lib/treat/entities/phrases.rb +17 -0
  21. data/lib/treat/entities/tokens.rb +28 -13
  22. data/lib/treat/entities/zones.rb +20 -0
  23. data/lib/treat/extractors.rb +49 -11
  24. data/lib/treat/extractors/coreferences/stanford.rb +68 -0
  25. data/lib/treat/extractors/date/chronic.rb +32 -0
  26. data/lib/treat/extractors/date/ruby.rb +25 -0
  27. data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
  28. data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
  29. data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
  30. data/lib/treat/extractors/language/what_language.rb +49 -0
  31. data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
  32. data/lib/treat/extractors/roles/naive.rb +73 -0
  33. data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
  34. data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
  35. data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
  36. data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
  37. data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
  38. data/lib/treat/extractors/time/nickel.rb +30 -12
  39. data/lib/treat/extractors/topic_words/lda.rb +9 -9
  40. data/lib/treat/extractors/topics/reuters.rb +14 -15
  41. data/lib/treat/extractors/topics/reuters/region.xml +1 -0
  42. data/lib/treat/features.rb +7 -0
  43. data/lib/treat/formatters/readers/abw.rb +6 -1
  44. data/lib/treat/formatters/readers/autoselect.rb +5 -6
  45. data/lib/treat/formatters/readers/doc.rb +3 -1
  46. data/lib/treat/formatters/readers/html.rb +1 -1
  47. data/lib/treat/formatters/readers/image.rb +43 -0
  48. data/lib/treat/formatters/readers/odt.rb +1 -2
  49. data/lib/treat/formatters/readers/pdf.rb +9 -1
  50. data/lib/treat/formatters/readers/xml.rb +40 -0
  51. data/lib/treat/formatters/serializers/xml.rb +50 -14
  52. data/lib/treat/formatters/serializers/yaml.rb +7 -2
  53. data/lib/treat/formatters/unserializers/xml.rb +33 -7
  54. data/lib/treat/formatters/visualizers/dot.rb +90 -20
  55. data/lib/treat/formatters/visualizers/short_value.rb +2 -2
  56. data/lib/treat/formatters/visualizers/standoff.rb +2 -2
  57. data/lib/treat/formatters/visualizers/tree.rb +1 -1
  58. data/lib/treat/formatters/visualizers/txt.rb +13 -4
  59. data/lib/treat/group.rb +16 -10
  60. data/lib/treat/helpers/linguistics_loader.rb +18 -0
  61. data/lib/treat/inflectors.rb +10 -0
  62. data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
  63. data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
  64. data/lib/treat/inflectors/declensions/english.rb +319 -0
  65. data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
  66. data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
  67. data/lib/treat/install.rb +59 -0
  68. data/lib/treat/kernel.rb +18 -8
  69. data/lib/treat/languages.rb +18 -11
  70. data/lib/treat/languages/arabic.rb +4 -2
  71. data/lib/treat/languages/chinese.rb +6 -2
  72. data/lib/treat/languages/dutch.rb +16 -0
  73. data/lib/treat/languages/english.rb +47 -19
  74. data/lib/treat/languages/french.rb +8 -5
  75. data/lib/treat/languages/german.rb +9 -6
  76. data/lib/treat/languages/greek.rb +16 -0
  77. data/lib/treat/languages/italian.rb +6 -3
  78. data/lib/treat/languages/polish.rb +16 -0
  79. data/lib/treat/languages/portuguese.rb +16 -0
  80. data/lib/treat/languages/russian.rb +16 -0
  81. data/lib/treat/languages/spanish.rb +16 -0
  82. data/lib/treat/languages/swedish.rb +16 -0
  83. data/lib/treat/languages/tags.rb +377 -0
  84. data/lib/treat/lexicalizers.rb +34 -23
  85. data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
  86. data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
  87. data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
  88. data/lib/treat/lexicalizers/tag/brill.rb +35 -40
  89. data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
  90. data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
  91. data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
  92. data/lib/treat/processors.rb +8 -8
  93. data/lib/treat/processors/chunkers/txt.rb +4 -4
  94. data/lib/treat/processors/parsers/enju.rb +114 -99
  95. data/lib/treat/processors/parsers/stanford.rb +109 -41
  96. data/lib/treat/processors/segmenters/punkt.rb +17 -18
  97. data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
  98. data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
  99. data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
  100. data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
  101. data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
  102. data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
  103. data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
  104. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
  105. data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
  106. data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
  107. data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
  108. data/lib/treat/processors/segmenters/stanford.rb +38 -37
  109. data/lib/treat/processors/segmenters/tactful.rb +5 -4
  110. data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
  111. data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
  112. data/lib/treat/processors/tokenizers/perl.rb +2 -2
  113. data/lib/treat/processors/tokenizers/punkt.rb +6 -2
  114. data/lib/treat/processors/tokenizers/stanford.rb +25 -24
  115. data/lib/treat/processors/tokenizers/tactful.rb +1 -2
  116. data/lib/treat/proxies.rb +2 -35
  117. data/lib/treat/registrable.rb +17 -22
  118. data/lib/treat/sugar.rb +11 -11
  119. data/lib/treat/tree.rb +27 -17
  120. data/lib/treat/viewable.rb +29 -0
  121. data/lib/treat/visitable.rb +1 -1
  122. data/test/tc_entity.rb +56 -49
  123. data/test/tc_extractors.rb +41 -18
  124. data/test/tc_formatters.rb +7 -8
  125. data/test/tc_inflectors.rb +19 -24
  126. data/test/tc_lexicalizers.rb +12 -19
  127. data/test/tc_processors.rb +26 -12
  128. data/test/tc_resources.rb +2 -7
  129. data/test/tc_treat.rb +20 -22
  130. data/test/tc_tree.rb +4 -4
  131. data/test/tests.rb +3 -5
  132. data/test/texts.rb +13 -14
  133. data/tmp/INFO +1 -0
  134. metadata +78 -158
  135. data/bin/INFO +0 -1
  136. data/examples/benchmark.rb +0 -81
  137. data/examples/keywords.rb +0 -148
  138. data/lib/treat/detectors.rb +0 -31
  139. data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
  140. data/lib/treat/detectors/format/file.rb +0 -36
  141. data/lib/treat/detectors/language/what_language.rb +0 -29
  142. data/lib/treat/entities/constituents.rb +0 -15
  143. data/lib/treat/entities/sentence.rb +0 -8
  144. data/lib/treat/extractors/named_entity/abner.rb +0 -20
  145. data/lib/treat/extractors/named_entity/stanford.rb +0 -174
  146. data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
  147. data/lib/treat/extractors/time/chronic.rb +0 -20
  148. data/lib/treat/extractors/time/native.rb +0 -18
  149. data/lib/treat/formatters/readers/gocr.rb +0 -26
  150. data/lib/treat/formatters/readers/ocropus.rb +0 -31
  151. data/lib/treat/formatters/visualizers/html.rb +0 -13
  152. data/lib/treat/formatters/visualizers/inspect.rb +0 -20
  153. data/lib/treat/inflectors/declensions/en.rb +0 -18
  154. data/lib/treat/languages/categories.rb +0 -5
  155. data/lib/treat/languages/english/categories.rb +0 -23
  156. data/lib/treat/languages/english/tags.rb +0 -352
  157. data/lib/treat/languages/xinhua.rb +0 -12
  158. data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
  159. data/lib/treat/string.rb +0 -5
  160. data/test/tc_detectors.rb +0 -26
data/lib/treat/inflectors/declensions/linguistics.rb
@@ -7,31 +7,32 @@ module Treat
   #
   # Project website: http://deveiate.org/projects/Linguistics/
   class Linguistics
-    # Require Ruby Linguistics
-    silence_warnings { require 'linguistics' }
+    require 'treat/helpers/linguistics_loader'
     # Retrieve a declension of a word using the 'linguistics' gem.
     #
     # Options:
     #
     # - (Identifier) :count => :singular, :plural
     def self.declensions(entity, options = {})
-      begin
-        l = entity.language.to_s.upcase
-        delegate = nil
-        silence_warnings { delegate = ::Linguistics.const_get(l) }
-      rescue RuntimeError
-        raise "Ruby Linguistics does not have a module " +
-              " installed for the #{entity.language} language."
+      unless options[:count]
+        raise Treat::Exception,
+          "Must supply option count (:singular or :plural)."
       end
+      klass = Treat::Helpers::LinguisticsLoader.load(entity.language)
       string = entity.to_s
+      if entity.category == :verb
+        raise Treat::Exception,
+          "Cannot retrieve the declensions of a verb. " +
+          "Use #singular_verb and #plural_verb instead."
+      end
       if options[:count] == :plural
         if entity.has?(:category) &&
           [:noun, :adjective, :verb].include?(entity.category)
           silence_warnings do
-            delegate.send(:"plural_#{entity.category}", string)
+            klass.send(:"plural_#{entity.category}", string)
           end
         else
-          silence_warnings { delegate.plural(string) }
+          silence_warnings { klass.plural(string) }
         end
       end
     end
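
A minimal usage sketch of the reworked declensions worker, assuming `word` is a Treat word entity whose language resolves to :english and whose category is already set to :noun; the entity setup and the full constant path (inferred from the file layout) are assumptions, not part of this hunk:

    # Calling the declensions worker directly (treat 0.2.0).
    Treat::Inflectors::Declensions::Linguistics.declensions(word, :count => :plural)
    # => the plural form, via the Linguistics module the loader returns for :english

    Treat::Inflectors::Declensions::Linguistics.declensions(word)
    # => raises Treat::Exception, "Must supply option count (:singular or :plural)."
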
data/lib/treat/inflectors/ordinal_words/linguistics.rb
@@ -7,12 +7,12 @@ module Treat
   #
   # Project website: http://deveiate.org/projects/Linguistics/
   class Linguistics
-    # Require Ruby Linguistics.
-    silence_warnings { require 'linguistics' }
+    require 'treat/helpers/linguistics_loader'
     # Desribe a number in words in ordinal form, using the
     # 'linguistics' gem.
     def self.ordinal_words(number, options = {})
-      silence_warnings { ::Linguistics::EN.ordinate(number.to_s) }
+      klass = Treat::Helpers::LinguisticsLoader.load(number.language)
+      klass.ordinate(number.to_s)
     end
   end
 end
data/lib/treat/install.rb
@@ -0,0 +1,59 @@
+module Treat
+  class Installer
+    require 'rubygems/dependency_installer'
+    # Install required dependencies and optional dependencies
+    # for a specific language.
+    def self.install(language = :english)
+
+      lang = Treat::Languages.get(language)
+      required = lang::RequiredDependencies
+      optional = lang::OptionalDependencies
+
+      puts "Treat Installer\n\n"
+      puts "Installing dependencies for the #{language.to_s.capitalize} language.\n\n"
+
+      flag = false
+
+      inst = Gem::DependencyInstaller.new
+
+      required.each do |dependency|
+        puts "Installing required dependency '#{dependency}'..."
+        begin
+          silence_warnings { inst.install(dependency) }
+        rescue
+          flag = true
+          puts "Couldn't install '#{dependency}'. " +
+            "You need install this dependency manually by running: " +
+            "'gem install #{dependency}' or use 'sudo' to run this script."
+        end
+      end
+
+      optional.each do |dependency|
+        begin
+          puts "Install optional dependency '#{dependency}' (yes/no, <enter> = skip) ?"
+          answer = gets.strip
+          raise Treat::Exception unless ['yes', 'no', ''].include?(answer)
+          if answer == 'yes'
+            silence_warnings { inst.install(dependency) }
+          else
+            puts "Skipped installing '#{dependency}'."
+            next
+          end
+        rescue Treat::Exception
+          puts "Invalid input - valid options are 'yes' or 'no'."
+          retry
+        rescue
+          flag = true
+          puts "Couldn't install '#{dependency}'. " +
+            "You can install this dependency manually by running: " +
+            "'gem install #{dependency}' or use 'sudo' to run this script."
+        end
+      end
+
+      w = flag ? 'incompletely' : 'normally'
+      puts "\nInstall proceeded #{w}."
+      puts
+
+    end
+  end
+end
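
A short sketch of invoking the new installer; only Treat::Installer.install is defined in this file, so the require path and session shown here are assumptions:

    # Install dependencies for English (may need 'sudo', as the messages above suggest).
    require 'treat'
    require 'treat/install'
    Treat::Installer.install(:english)
    # Installs each RequiredDependency via Gem::DependencyInstaller,
    # then prompts yes/no for every OptionalDependency.
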
data/lib/treat/kernel.rb
@@ -7,7 +7,7 @@ module Kernel
   # A list of acronyms used in class names within
   # the program. These do not CamelCase; they
   # CAMELCase.
-  Acronyms = ['XML', 'HTML', 'YAML', 'UEA', 'LDA', 'PDF', 'GOCR'].join('|')
+  Acronyms = ['XML', 'HTML', 'YAML', 'UEA', 'LDA', 'PDF'].join('|')
   # A cache to optimize camel casing.
   @@cc_cache = {}
   # A cache to optimize un camel casing.
@@ -24,15 +24,17 @@ module Kernel
     $VERBOSE = warn_level
     result
   end
-  # Runs a block of code while blocking
-  # stdout. Currently not implemented.
-  def silence_streams(*streams)
+  # Runs a block of code while blocking stdout.
+  def silence_stdout(log = '/dev/null')
+    old = $stdout.dup
+    $stdout.reopen(File.new(log, 'w'))
     yield
+    $stdout = old
   end
   # Create a temporary file which is deleted
   # after execution of the block.
   def create_temp_file(ext, value = nil, &block)
-    fname = "../tmp/#{Random.rand(10000000).to_s}.#{ext}"
+    fname = "#{Treat.lib}/../tmp/#{Random.rand(10000000).to_s}.#{ext}"
     File.open(fname, 'w') do |f|
       f.write(value) if value
       block.call(f.path)
@@ -40,11 +42,19 @@ module Kernel
   ensure
     File.delete(fname)
   end
+  # Create a temporary directory.
+  def create_temp_dir(&block)
+    dname = "#{Treat.lib}/../tmp/#{Random.rand(10000000).to_s}"
+    Dir.mkdir(dname)
+    block.call(dname)
+  ensure
+    FileUtils.rm_rf(dname)
+  end
   # Convert un_camel_case to CamelCase.
   def camel_case(o_phrase)
     phrase = o_phrase.to_s.dup
     return @@cc_cache[o_phrase] if @@cc_cache[o_phrase]
-    phrase.gsub!(/#{Acronyms.downcase}[^a-z]+/) { |a| a.upcase }
+    phrase.gsub!(/#{Acronyms.downcase}[^a-z]*/) { |a| a.upcase }
     phrase.gsub!(/^[a-z]|_[a-z]/) { |a| a.upcase }
     phrase.gsub!('_', '')
     @@cc_cache[o_phrase] = phrase
@@ -77,7 +87,7 @@ module Kernel
         sugg << element
       end
     end
-    unless sugg.empty?
+    unless sugg.size == 0
       if sugg.size == 1
         msg += " Perhaps you meant '#{sugg[0]}' ?"
       else
@@ -94,7 +104,7 @@ module Kernel
   def caller_method(n = 3)
     at = caller(n).first
     /^(.+?):(\d+)(?::in `(.*)')?/ =~ at
-    :"#{Regexp.last_match[3]}"
+    Regexp.last_match[3].intern
   end
   alias :cm :caller_method
   # Return the levensthein distance between two stringsm
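
The two new Kernel helpers can be used as below; this sketch follows the definitions in the hunk above (the file names are illustrative):

    # Redirect $stdout to a log file (default '/dev/null') for the block's duration.
    silence_stdout('install.log') do
      puts 'written to install.log, not the terminal'
    end

    # Yield a scratch directory under #{Treat.lib}/../tmp, removed when the block returns.
    create_temp_dir do |dir|
      File.open(File.join(dir, 'scratch.txt'), 'w') { |f| f.write('temporary data') }
    end
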
data/lib/treat/languages.rb
@@ -12,7 +12,7 @@ module Treat
   # or its full text description in full French or English.
   def self.describe(lang, desc_lang = :en)
     raise "Must provide a non-nil language identifier to describe." if lang.nil?
-    lang = find(lang).to_s
+    lang = code(lang).to_s
     if [:en, :eng, :english, :anglais].include?(desc_lang)
       l = @@english_full.key(lang)
     elsif [:fr, :fra, :french, :french].include?(desc_lang)
@@ -42,22 +42,22 @@ module Treat
   # or full name (in English or French) and return
   # the ISO-639-1 or ISO-639-2 language code as a
   # lowercase identifier.
-  def self.find(lang, rc = ISO639_2)
+  def self.code(lang, rc = ISO639_2)
     raise "Must provide a non-nil language identifier to describe." if lang.nil?
     get_languages
     lang = lang.to_s.downcase
     if @@iso639_1.has_key?(lang)
-      return :"#{lang}" if rc == ISO639_1
-      return :"#{@@iso639_1[lang]}" if rc == ISO639_2
+      return lang.intern if rc == ISO639_1
+      return @@iso639_1[lang].intern if rc == ISO639_2
     elsif @@iso639_2.has_key?(lang)
-      return :"#{lang}" if rc == ISO639_2
-      return :"#{@@iso639_2[lang]}" if rc == ISO639_1
+      return lang.intern if rc == ISO639_2
+      return @@iso639_2[lang].intern if rc == ISO639_1
     elsif @@english_full.has_key?(lang)
-      return :"#{@@english_full[lang]}" if rc == ISO639_2
-      return :"#{@@iso639_2[@@english_full[lang]]}" if rc == ISO639_1
+      return @@english_full[lang].intern if rc == ISO639_2
+      return @@iso639_2[@@english_full[lang]].intern if rc == ISO639_1
     elsif @@french_full.has_key?(lang)
-      return :"#{@@french_full[lang]}" if rc == ISO639_2
-      return :"#{@@iso639_1[@@french_full[lang]]}" if rc == ISO639_2
+      return @@french_full[lang].intern if rc == ISO639_2
+      return @@iso639_1[@@french_full[lang]].intern if rc == ISO639_2
     else
       not_found(lang)
     end
@@ -87,5 +87,12 @@ module Treat
     end
     @@loaded = true
   end
+  # A list of all possible word categories.
+  WordCategories = [
+    :adjective, :adverb, :noun, :verb, :interjection,
+    :clitic, :coverb, :conjunction, :determiner, :particle,
+    :preposition, :pronoun, :number, :symbol, :punctuation,
+    :complementizer
+  ]
 end
-end
+end
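
A sketch of the renamed lookup; the return values shown are what the ISO-639 tables imply and are not verified here:

    Treat::Languages.code(:english)                            # => :eng  (ISO-639-2, the default)
    Treat::Languages.code(:eng, Treat::Languages::ISO639_1)    # => :en
    Treat::Languages.code(:klingon)                            # falls through to not_found(lang)
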
data/lib/treat/languages/arabic.rb
@@ -1,13 +1,15 @@
 module Treat
   module Languages
     class Arabic
+      RequiredDependencies = []
+      OptionalDependencies = []
       Extractors = {}
       Inflectors = {}
       Lexicalizers = {
-        tag: [:stanford]
+        :tag => [:stanford]
       }
       Processors = {
-        parsers: [:stanford]
+        :parsers => [:stanford]
       }
     end
   end
data/lib/treat/languages/chinese.rb
@@ -1,12 +1,16 @@
 module Treat
   module Languages
     class Chinese
+      RequiredDependencies = []
+      OptionalDependencies = []
      Extractors = {}
      Inflectors = {}
      Lexicalizers = {
-        tag: [:stanford]
+        :tag => [:stanford]
+      }
+      Processors = {
+        :parsers => [:stanford]
      }
-      Processors = {}
     end
   end
 end
data/lib/treat/languages/dutch.rb
@@ -0,0 +1,16 @@
+module Treat
+  module Languages
+    class Dutch
+      RequiredDependencies = []
+      OptionalDependencies = []
+      Processors = {
+        :chunkers => [:txt],
+        :segmenters => [:punkt],
+        :tokenizers => [:tactful]
+      }
+      Extractors = {}
+      Inflectors = {}
+      Lexicalizers = {}
+    end
+  end
+end
data/lib/treat/languages/english.rb
@@ -2,34 +2,62 @@ module Treat
   module Languages
     class English
 
-      require 'treat/languages/english/tags'
-      require 'treat/languages/english/categories'
+      RequiredDependencies = ['rbtagger', 'ruby-stemmer', 'tactful_tokenizer', 'nickel', 'wordnet']
+      OptionalDependencies = ['uea-stemmer', 'tokenizer', 'engtagger']
 
       Extractors = {
-        time: [:chronic],
-        topics: [:reuters],
-        topic_words: [:lda],
-        key_sentences: [:topics_frequency]
+        :time => [:nickel],
+        :date => [:chronic, :ruby],
+        :topics => [:reuters],
+        :topic_words => [:lda],
+        :keywords => [:tf_idf, :topics_tf_idf],
+        :named_entity_tag => [:stanford],
+        :coreferences => [:stanford],
+        :roles => [:naive]
       }
+
       Processors = {
-        chunkers: [:txt],
-        parsers: [:stanford, :enju],
-        segmenters: [:tactful, :punkt, :stanford],
-        tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
+        :chunkers => [:txt],
+        :parsers => [:stanford, :enju],
+        :segmenters => [:tactful, :punkt, :stanford],
+        :tokenizers => [:macintyre, :multilingual, :perl, :punkt, :stanford, :tactful]
       }
+
       Lexicalizers = {
-        category: [:from_tag],
-        linkages: [:naive],
-        synsets: [:wordnet, :rita_wn],
-        tag: [:brill, :lingua, :stanford]
+        :category => [:from_tag],
+        :linkages => [:naive],
+        :synsets => [:wordnet],
+        :tag => [:brill, :lingua, :stanford]
      }
+
       Inflectors = {
-        conjugations: [:linguistics],
-        declensions: [:linguistics, :english],
-        stem: [:porter_c, :porter, :uea],
-        ordinal_words: [:linguistics],
-        cardinal_words: [:linguistics]
+        :conjugations => [:linguistics],
+        :declensions => [:english, :linguistics],
+        :stem => [:porter, :porter_c, :uea],
+        :ordinal_words => [:linguistics],
+        :cardinal_words => [:linguistics]
      }
+
+      CommonWords = [
+        'the', 'of', 'and', 'a', 'to', 'in', 'is',
+        'you', 'that', 'it', 'he', 'was', 'for', 'on',
+        'are', 'as', 'with', 'his', 'they', 'I', 'at',
+        'be', 'this', 'have', 'from', 'or', 'one', 'had',
+        'by', 'word', 'but', 'not', 'what', 'all', 'were',
+        'we', 'when', 'your', 'can', 'said', 'there', 'use',
+        'an', 'each', 'which', 'she', 'do', 'how', 'their',
+        'if', 'will', 'up', 'other', 'about', 'out', 'many',
+        'then', 'them', 'these', 'so', 'some', 'her', 'would',
+        'make', 'like', 'him', 'into', 'time', 'has', 'look',
+        'two', 'more', 'write', 'go', 'see', 'number', 'no',
+        'way', 'could', 'people', 'my', 'than', 'first', 'been',
+        'call', 'who', 'its', 'now', 'find', 'long', 'down',
+        'day', 'did', 'get', 'come', 'made', 'may', 'part',
+        'say', 'also', 'new', 'much', 'should', 'still',
+        'such', 'before', 'after', 'other', 'then', 'over',
+        'under', 'therefore', 'nonetheless', 'thereafter',
+        'afterwards', 'here', 'huh', 'hah', "'nt", "'t", 'here'
+      ]
 
     end
   end
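
These per-language classes are plain configuration: each constant maps a task group to the algorithms available for that language. A sketch of reading the English settings defined above (the dispatch machinery that consumes them is outside this diff):

    Treat::Languages::English::Processors[:segmenters]      # => [:tactful, :punkt, :stanford]
    Treat::Languages::English::Extractors[:keywords]        # => [:tf_idf, :topics_tf_idf]
    Treat::Languages::English::RequiredDependencies.first   # => 'rbtagger'
    Treat::Languages::English::CommonWords.include?('the')  # => true
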
data/lib/treat/languages/french.rb
@@ -1,16 +1,19 @@
 module Treat
   module Languages
     class French
+      RequiredDependencies = []
+      OptionalDependencies = []
      Extractors = {}
      Inflectors = {}
      Lexicalizers = {
-        tag: [:stanford]
+        :tag => [:stanford],
+        :category => [:from_tag]
      }
      Processors = {
-        chunkers: [:txt],
-        parsers: [:stanford],
-        segmenters: [:tactful, :punkt, :stanford],
-        tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
+        :chunkers => [:txt],
+        :parsers => [:stanford],
+        :segmenters => [:punkt],
+        :tokenizers => [:tactful]
      }
     end
   end
 end
data/lib/treat/languages/german.rb
@@ -1,17 +1,20 @@
 module Treat
   module Languages
     class German
+      RequiredDependencies = []
+      OptionalDependencies = []
      Extractors = {}
      Inflectors = {}
      Lexicalizers = {
-        tag: [:stanford]
+        :tag => [:stanford],
+        :category => [:from_tag]
      }
      Processors = {
-        chunkers: [:txt],
-        parsers: [:stanford],
-        segmenters: [:tactful, :punkt, :stanford],
-        tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
+        :chunkers => [:txt],
+        :parsers => [:stanford],
+        :segmenters => [:punkt],
+        :tokenizers => [:tactful]
      }
     end
   end
-end
+end
data/lib/treat/languages/greek.rb
@@ -0,0 +1,16 @@
+module Treat
+  module Languages
+    class Greek
+      RequiredDependencies = []
+      OptionalDependencies = []
+      Processors = {
+        :chunkers => [:txt],
+        :segmenters => [:punkt],
+        :tokenizers => [:tactful]
+      }
+      Extractors = {}
+      Inflectors = {}
+      Lexicalizers = {}
+    end
+  end
+end