treat 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (81) hide show
  1. data/INSTALL +1 -0
  2. data/README +3 -0
  3. data/TODO +14 -26
  4. data/bin/INFO +1 -1
  5. data/lib/treat/buildable.rb +10 -11
  6. data/lib/treat/categories.rb +8 -6
  7. data/lib/treat/category.rb +7 -2
  8. data/lib/treat/delegatable.rb +64 -56
  9. data/lib/treat/detectors/encoding/r_chardet19.rb +1 -1
  10. data/lib/treat/detectors/language/language_detector.rb +2 -1
  11. data/lib/treat/detectors/language/what_language.rb +2 -2
  12. data/lib/treat/detectors.rb +3 -0
  13. data/lib/treat/entities/entity.rb +1 -1
  14. data/lib/treat/entities.rb +9 -10
  15. data/lib/treat/exception.rb +3 -1
  16. data/lib/treat/extractors/named_entity/abner.rb +1 -1
  17. data/lib/treat/extractors/named_entity/stanford.rb +2 -2
  18. data/lib/treat/extractors/time/chronic.rb +2 -2
  19. data/lib/treat/extractors/time/nickel.rb +2 -2
  20. data/lib/treat/extractors/topic_words/lda.rb +2 -2
  21. data/lib/treat/extractors.rb +12 -9
  22. data/lib/treat/feature.rb +6 -1
  23. data/lib/treat/formatters/cleaners/html.rb +1 -1
  24. data/lib/treat/formatters.rb +8 -8
  25. data/lib/treat/group.rb +11 -10
  26. data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
  27. data/lib/treat/inflectors/{conjugators → conjugations}/linguistics.rb +6 -6
  28. data/lib/treat/inflectors/{declensors → declensions}/en.rb +2 -2
  29. data/lib/treat/inflectors/{declensors → declensions}/linguistics.rb +5 -5
  30. data/lib/treat/inflectors/ordinal_words/linguistics.rb +4 -4
  31. data/lib/treat/inflectors/{stemmers → stem}/porter.rb +1 -1
  32. data/lib/treat/inflectors/{stemmers → stem}/porter_c.rb +3 -3
  33. data/lib/treat/inflectors/{stemmers → stem}/uea.rb +3 -3
  34. data/lib/treat/inflectors.rb +8 -21
  35. data/lib/treat/kernel.rb +120 -0
  36. data/lib/treat/languages/arabic.rb +14 -0
  37. data/lib/treat/languages/categories.rb +5 -0
  38. data/lib/treat/languages/chinese.rb +12 -0
  39. data/lib/treat/languages/english/categories.rb +23 -0
  40. data/lib/treat/{resources → languages/english}/tags.rb +127 -184
  41. data/lib/treat/languages/english.rb +33 -0
  42. data/lib/treat/languages/french.rb +17 -0
  43. data/lib/treat/languages/german.rb +17 -0
  44. data/lib/treat/languages/italian.rb +14 -0
  45. data/lib/treat/{resources/languages.txt → languages/list.txt} +0 -0
  46. data/lib/treat/languages/xinhua.rb +12 -0
  47. data/lib/treat/languages.rb +91 -0
  48. data/lib/treat/lexicalizers/category/from_tag.rb +20 -8
  49. data/lib/treat/lexicalizers/synsets/rita_wn.rb +1 -1
  50. data/lib/treat/lexicalizers/tag/brill.rb +2 -1
  51. data/lib/treat/lexicalizers/tag/lingua.rb +2 -1
  52. data/lib/treat/lexicalizers/tag/stanford.rb +16 -15
  53. data/lib/treat/lexicalizers.rb +1 -1
  54. data/lib/treat/object.rb +6 -0
  55. data/lib/treat/processors/parsers/enju.rb +3 -2
  56. data/lib/treat/processors/parsers/stanford.rb +15 -12
  57. data/lib/treat/processors/segmenters/punkt.rb +1 -1
  58. data/lib/treat/processors/segmenters/stanford.rb +7 -5
  59. data/lib/treat/processors/segmenters/tactful.rb +1 -1
  60. data/lib/treat/processors/tokenizers/multilingual.rb +2 -2
  61. data/lib/treat/processors/tokenizers/stanford.rb +7 -5
  62. data/lib/treat/visitable.rb +2 -1
  63. data/lib/treat.rb +105 -54
  64. data/test/tc_entity.rb +5 -0
  65. data/test/tc_resources.rb +5 -5
  66. data/test/tc_treat.rb +1 -2
  67. data/test/tests.rb +2 -1
  68. metadata +63 -64
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +0 -96
  70. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +0 -213
  71. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +0 -68
  72. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +0 -6
  73. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +0 -12
  74. data/lib/treat/resources/categories.rb +0 -18
  75. data/lib/treat/resources/delegates.rb +0 -96
  76. data/lib/treat/resources/dependencies.rb +0 -0
  77. data/lib/treat/resources/edges.rb +0 -8
  78. data/lib/treat/resources/formats.rb +0 -23
  79. data/lib/treat/resources/languages.rb +0 -86
  80. data/lib/treat/resources.rb +0 -10
  81. data/lib/treat/utilities.rb +0 -127
data/INSTALL CHANGED
@@ -0,0 +1 @@
1
+ See the Wiki: https://github.com/louismullie/treat/wiki/Installing-Treat
data/README CHANGED
@@ -0,0 +1,3 @@
1
+ Treat - Text Retrieval and Annotation Toolkit
2
+
3
+ See the wiki for more information at https://github.com/louismullie/treat/wiki/.
data/TODO CHANGED
@@ -1,23 +1,29 @@
1
- == Before first release
1
+ ## Urgent
2
2
 
3
- - Linkers, inflectors
3
+ - Linkers
4
4
  - Check taggers for context
5
5
  - Stanford dependencies parse
6
- - Enju : make sure Enju parses constituents, remove_last, abbr
6
+ - Enju: test
7
7
  - Ocropus => use better function
8
- - Optimize magic methods... bigger or smaller, is_token? type methods, phrase categories.
8
+ - Optimize magic methods... is_token? type methods, phrase categories.
9
9
  - Move statistics?
10
10
  - Synset class move
11
11
  - general procedure for options, check that user doesn't want to change options...
12
- - Resources: dependencies vs. edges, PTB function tags
12
+ - Languages: dependencies vs. edges, PTB function tags
13
13
  - Check for # Fix everywhere
14
14
  - Check paths; parse bin paths
15
15
  - Ferret, Natural Inputs
16
16
  - Use consistently delegate
17
17
  - Text becomes section
18
+ - Remove top level
19
+ - Loading multiple JARs
20
+ - Tokenized sentences are not parsed
21
+ - Documentation
22
+ - Remove feature
18
23
 
19
- === Eventually
24
+ ## Eventually
20
25
 
26
+ - English inflector
21
27
  - RDF output
22
28
  - Apache OpenNLP
23
29
  - Ariel
@@ -44,24 +50,6 @@
44
50
  - Probabilistic features: rchardet19, what_language
45
51
  - Enju multithreading ?
46
52
  - String type detector for other languages
47
-
48
53
  - Automatic benchmark
49
-
50
- === Distant future
51
-
52
- - Spell Cheker Raspell
53
- - Multithreading
54
-
55
- === Checklist before releasing
56
-
57
- - Remove code from main page
58
- - Remove lib path from tests and main page
59
- - Remove docs
60
- - gem19 build treat.gemspec
61
-
62
- === Performance
63
-
64
- - Cache results that get computed often
65
- - Use .size == 0 instead of .empty?
66
- - Optimize method_missing using define_method, even dynamically
67
- - Array include is slow
54
+ - Raspell spell checker
55
+ - Multithreading
data/bin/INFO CHANGED
@@ -1 +1 @@
1
- This is where you can put your JAR files.
1
+ This is where Treat will look for the Stanford JAR files by default. You can change this to another directory by setting Treat.bin = '/path/to/your/folder/' at runtime.
@@ -1,6 +1,9 @@
1
1
  module Treat
2
+ # Represents an object that can be built
3
+ # from a folder of files, a specific file,
4
+ # a string or a numeric object. This module
5
+ # is pretty much self-explanatory.
2
6
  module Buildable
3
-
4
7
  def from_anything(file_or_value, id)
5
8
  if File.readable?(file_or_value.to_s)
6
9
  from_file(file_or_value)
@@ -9,11 +12,11 @@ module Treat
9
12
  elsif file_or_value.is_a?(Numeric)
10
13
  from_numeric(file_or_value)
11
14
  else
12
- raise "Unrecognizable input #{file_or_value}. "+
15
+ raise Treat::Exception,
16
+ "Unrecognizable input #{file_or_value}. "+
13
17
  "Use filename, folder, text or a number."
14
18
  end
15
19
  end
16
-
17
20
  def from_string(string)
18
21
  if self == Treat::Entities::Document ||
19
22
  self == Treat::Entities::Collection
@@ -37,7 +40,6 @@ module Treat
37
40
  end
38
41
  return Treat::Entities::Unknown.new(string)
39
42
  end
40
-
41
43
  def from_numeric(numeric)
42
44
  unless self == Treat::Entities::Number
43
45
  raise Treat::Exception,
@@ -46,7 +48,6 @@ module Treat
46
48
  end
47
49
  Treat::Entities::Number.new(numeric.to_s)
48
50
  end
49
-
50
51
  def from_folder(folder)
51
52
  unless FileTest.directory?(folder)
52
53
  raise Treat::Exception,
@@ -69,7 +70,6 @@ module Treat
69
70
  end
70
71
  c
71
72
  end
72
-
73
73
  def from_file(file)
74
74
  unless File.readable?(file)
75
75
  raise Treat::Exception,
@@ -79,7 +79,8 @@ module Treat
79
79
  from_folder(file)
80
80
  else
81
81
  ext = file.split('.')[-1]
82
- ext = 'yaml' if ext == 'yml' # Humanize the yml extension.
82
+ # Humanize the yaml extension.
83
+ ext = 'yaml' if ext == 'yml'
83
84
  if Treat::Formatters::Unserializers.list.
84
85
  include?(ext.downcase.intern)
85
86
  from_serialized_file(file)
@@ -88,7 +89,6 @@ module Treat
88
89
  end
89
90
  end
90
91
  end
91
-
92
92
  def from_raw_file(file)
93
93
  unless self == Treat::Entities::Document
94
94
  raise Treat::Exception,
@@ -98,9 +98,9 @@ module Treat
98
98
  d = Treat::Entities::Document.new(file)
99
99
  d.read
100
100
  end
101
-
102
101
  def from_serialized_file(file)
103
- unless [Treat::Entities::Document, Treat::Entities::Collection].include?(self)
102
+ unless [Treat::Entities::Document,
103
+ Treat::Entities::Collection].include?(self)
104
104
  raise Treat::Exception,
105
105
  "Cannot create something else than a " +
106
106
  "document from raw file '#{file}'."
@@ -110,6 +110,5 @@ module Treat
110
110
  d.children[0].set_as_root!
111
111
  d.children[0]
112
112
  end
113
-
114
113
  end
115
114
  end
@@ -1,16 +1,18 @@
1
1
  module Treat
2
+ # This module keeps track of all categories that
3
+ # exist and the methods they implement, and is
4
+ # responsible for including the categories.
2
5
  module Categories
3
- # Modify the module that includes Category to
4
- # setup autoload, delegators and provide a list
5
- # of methods.
6
+ # A list of categories.
6
7
  class << self; attr_accessor :list; end
7
8
  self.list = []
8
- # Boolean - does any of the categories
9
- # groups respond to the symbol.
9
+ # Boolean - does any of the categories have
10
+ # a method that corresponds to sym?
10
11
  def self.have_method?(sym); methods.include?(sym); end
12
+ # Cache the list of methods once it has been computed.
13
+ @@methods = []
11
14
  # Provide a list of all methods implemented
12
15
  # by all Treat categories.
13
- @@methods = []
14
16
  def self.methods
15
17
  return @@methods unless @@methods.empty?
16
18
  self.list.each do |ns|
@@ -1,7 +1,11 @@
1
1
  module Treat
2
- # Provides functions common to all algorithm categories.
2
+ # Clusters together groups of algorithms that
3
+ # perform similar functions.
3
4
  module Category
5
+ # Require the Group class.
4
6
  require 'treat/group'
7
+ # Add delegators to the Entities based on the
8
+ # configuration for a given category.
5
9
  def self.extended(category)
6
10
  Treat::Categories.list << category
7
11
  category.module_eval do
@@ -14,9 +18,10 @@ module Treat
14
18
  end
15
19
  end
16
20
  end
21
+ # Provides a list of groups within this category.
17
22
  def groups; self.constants; end
18
23
  # Provide a list of methods implemented in
19
- # the groups contained within that
24
+ # the groups contained within this category.
20
25
  def methods
21
26
  methods = []
22
27
  groups.each do |group|
@@ -2,34 +2,8 @@ module Treat
2
2
  # Makes a class delegatable, allowing calls on it to be forwarded
3
3
  # to a delegate class performing the appropriate call.
4
4
  module Delegatable
5
-
6
- # Get the default delegate for that language
7
- # inside the given group.
8
- def get_language_delegate(language, group)
9
- lang = Treat::Resources::Languages.describe(language)
10
- lclass = cc(lang).intern
11
- if Treat::Resources::Delegates.constants.include?(lclass)
12
- cat = group.to_s.split('::')[-2].intern
13
- lclass = Treat::Resources::Delegates.
14
- const_get(lclass).const_get(cat)
15
- g = ucc(cl(group)).intern
16
- if !lclass[g] || !lclass[g][0]
17
- d = ucc(cl(group))
18
- d.gsub!('_', ' ')
19
- d = d[0..-2] if d[-1] == 's'
20
- d = 'delegator to find ' + d
21
- raise Treat::Exception, "No #{d}" +
22
- " is available for the #{lang} language."
23
- end
24
- return lclass[g][0]
25
- else
26
- raise Treat::Exception,
27
- "Language '#{lang}' is not supported (yet)."
28
- end
29
- end
30
-
31
5
  # Add decorator methods to entities.
32
- def decorate(group, m)
6
+ def add_decorators(group, m)
33
7
  decorators = group.methods -
34
8
  Object.methods -
35
9
  [:type, :type=, :targets, :targets=,
@@ -42,49 +16,83 @@ module Treat
42
16
  end
43
17
  end
44
18
  end
45
-
46
- # Raise an exception and suggest alternatives.
47
- def delegate_not_found(klass, group)
48
- "Algorithm '#{ucc(klass)}' couldn't be found in group #{group}." +
49
- did_you_mean?(group.list.map { |c| ucc(c) }, ucc(klass))
50
- end
51
-
52
19
  # Add delegator group to all entities of a class.
53
20
  def add_delegators(group)
54
21
  # Define each method in group.
55
22
  self.class_eval do
56
23
  m = group.method
57
- decorate(group, m)
24
+ add_decorators(group, m)
58
25
  define_method(m) do |delegate=nil, options={}|
59
26
  decorator = options.delete(:decorator)
60
27
  puts self.id if !@features
61
28
  if !@features[m].nil?
62
29
  @features[m]
63
30
  else
64
- if delegate.nil?
65
- delegate = group.default.nil? ?
66
- self.class.get_language_delegate(language, group) :
67
- group.default
68
- raise "No default delegate for #{group}." if delegate == :none
69
- end
70
- if not group.list.include?(delegate)
71
- raise Treat::Exception,
72
- self.class.delegate_not_found(delegate, group)
73
- else
74
- delegate_klass = group.const_get(:"#{cc(delegate.to_s)}")
75
- result = accept(group, delegate_klass, m, options)
76
- if decorator
77
- result = group.send(decorator, self, result)
78
- end
79
- if group.type == :annotator
80
- f = decorator.nil? ? m : decorator
81
- @features[f] = result
82
- end
83
- result
84
- end
31
+ self.class.call_delegator(
32
+ self, m, delegate, decorator,
33
+ group, options)
85
34
  end
86
35
  end
87
36
  end
88
37
  end
38
+ # Call a delegator.
39
+ def call_delegator(entity, m, delegate, decorator, group, options)
40
+ if delegate.nil?
41
+ delegate = get_missing_delegate(entity, group)
42
+ end
43
+ if not group.list.include?(delegate)
44
+ raise Treat::Exception, delegate_not_found(delegate, group)
45
+ else
46
+ delegate_klass = group.const_get(:"#{cc(delegate.to_s)}")
47
+ result = entity.accept(group, delegate_klass, m, options)
48
+ if decorator
49
+ result = group.send(decorator, self, result)
50
+ end
51
+ if group.type == :annotator
52
+ f = decorator.nil? ? m : decorator
53
+ entity.features[f] = result
54
+ end
55
+ result
56
+ end
57
+ end
58
+ # Get the default delegate for that language
59
+ # inside the given group.
60
+ def get_language_delegate(language, group)
61
+ lang = Treat::Languages.describe(language)
62
+ lclass = cc(lang).intern
63
+ if Treat::Languages.constants.include?(lclass)
64
+ cat = group.to_s.split('::')[-2].intern
65
+ lclass = Treat::Languages.const_get(lclass).const_get(cat)
66
+ g = ucc(cl(group)).intern
67
+ if !lclass[g] || !lclass[g][0]
68
+ d = ucc(cl(group))
69
+ d.gsub!('_', ' ')
70
+ d = d[0..-2] if d[-1] == 's'
71
+ d = 'delegator to find ' + d
72
+ raise Treat::Exception, "No #{d}" +
73
+ " is available for the #{lang} language."
74
+ end
75
+ return lclass[g][0]
76
+ else
77
+ raise Treat::Exception,
78
+ "Language '#{lang}' is not supported (yet)."
79
+ end
80
+ end
81
+ # Get which delegate to use if none has been supplied.
82
+ def get_missing_delegate(entity, group)
83
+ delegate = group.default.nil? ?
84
+ self.get_language_delegate(entity.language, group) :
85
+ group.default
86
+ if delegate == :none
87
+ raise NAT::Exception,
88
+ "There is intentionally no default delegate for #{group}."
89
+ end
90
+ delegate
91
+ end
92
+ # Return an error message and suggest possible typos.
93
+ def delegate_not_found(klass, group)
94
+ "Algorithm '#{ucc(klass)}' couldn't be found in group #{group}." +
95
+ did_you_mean?(group.list.map { |c| ucc(c) }, ucc(klass))
96
+ end
89
97
  end
90
98
  end
@@ -2,7 +2,7 @@ module Treat
2
2
  module Detectors
3
3
  module Encoding
4
4
  # Require the 'rchardet19' gem.
5
- silently { require 'rchardet19' }
5
+ silence_warnings { require 'rchardet19' }
6
6
  # A wrapper for the 'rchardet19' gem, which
7
7
  # detects the encoding of a file.
8
8
  class RChardet19
@@ -9,7 +9,8 @@ module Treat
9
9
  dlvl = Treat.language_detection_level
10
10
  if (Entities.rank(entity.type) < Entities.rank(dlvl)) &&
11
11
  entity.has_parent?
12
- return entity.ancestor_with_type(dlvl).language
12
+ anc = entity.ancestor_with_type(dlvl)
13
+ return anc.language if anc
13
14
  end
14
15
  end
15
16
  end
@@ -2,7 +2,7 @@ module Treat
2
2
  module Detectors
3
3
  module Language
4
4
  # Require the 'whatlanguage' gem.
5
- silently { require 'whatlanguage' }
5
+ silence_warnings { require 'whatlanguage' }
6
6
  # Adaptor for the 'whatlanguage' gem, which
7
7
  # performs probabilistic language detection.
8
8
  class WhatLanguage < LanguageDetector
@@ -19,7 +19,7 @@ module Treat
19
19
  all = @@wl.process_text(entity.to_s)
20
20
  lang = {}
21
21
  all.each do |k,v|
22
- lang[Treat::Resources::Languages.find(k)] = v
22
+ lang[Treat::Languages.find(k)] = v
23
23
  end
24
24
  Treat::Feature.new(lang).best
25
25
  end
@@ -1,6 +1,9 @@
1
1
  module Treat
2
2
  # Detectors detect a specific meta-information about
3
3
  # an entity, such as encoding, format and language.
4
+ #
5
+ # Detectors are language-independent, and thus there
6
+ # are default algorithms specified for each of them.
4
7
  module Detectors
5
8
  # Group for algorithms that detect encoding.
6
9
  module Encoding
@@ -93,7 +93,7 @@ module Treat
93
93
  # dispatches done by Ruby to improve performance.
94
94
  def parse_magic_method(sym, *args, &block)
95
95
  @@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
96
- @@cats_regexp ||= "(#{Treat::Resources::Categories::List.join('|')})"
96
+ @@cats_regexp ||= "(#{Treat::Languages::English::Categories.join('|')})"
97
97
  method = sym.to_s =~ /entities/ ?
98
98
  sym.to_s.gsub('entities', 'entitys'):
99
99
  method = sym.to_s
@@ -4,15 +4,14 @@ module Treat
4
4
  #
5
5
  # - Collection
6
6
  # - Document
7
- # - Text
8
7
  # - Zone (a Section, Title, Paragraph, or List)
9
8
  # - Sentence
10
9
  # - Constituent (a Phrase or Clause)
11
10
  # - Token (a Word, Number, Punctuation, or Symbol).
12
11
  module Entities
13
- # Require Entity first, since the other classes
14
- # extend this class.
12
+ # Require Entity first.
15
13
  require 'treat/entities/entity'
14
+ # Then require all possible entities.
16
15
  require 'treat/entities/collection'
17
16
  require 'treat/entities/document'
18
17
  require 'treat/entities/text'
@@ -40,13 +39,13 @@ module Treat
40
39
  # comparison of entity types.
41
40
  def self.rank(type)
42
41
  klass = Entities.const_get(cc(type))
43
- return 6 if klass == Collection || klass < Collection
44
- return 5 if klass == Document || klass < Document
45
- return 4 if klass == Text || klass < Text
46
- return 3 if klass == Zone || klass < Zone
47
- return 2 if klass == Sentence || klass < Sentence
48
- return 1 if klass == Constituent || klass < Constituent
49
- return 0 if klass == Token || klass < Token
42
+ compare = lambda { |a,b| a == b || a < b }
43
+ return 0 if compare.call(klass, Token)
44
+ return 1 if compare.call(klass, Constituent)
45
+ return 2 if compare.call(klass, Sentence)
46
+ return 4 if compare.call(klass, Document)
47
+ return 3 if compare.call(klass, Section)
48
+ return 5 if compare.call(klass, Collection)
50
49
  end
51
50
  end
52
51
  end
@@ -1,5 +1,7 @@
1
1
  module Treat
2
- # Exception class for the Treat library.
2
+ # Custom exception class for the Treat toolkit.
3
+ # Used to distinguish errors raised by
4
+ # gems or Ruby from errors raised by the toolkit.
3
5
  class Exception < ::Exception
4
6
  end
5
7
  end
@@ -3,7 +3,7 @@ module Treat
3
3
  module NamedEntity
4
4
  class Abner
5
5
  # Require the Ruby-Java bridge.
6
- silently do
6
+ silence_warnings do
7
7
  require 'rjb'
8
8
  Rjb::load('', ['-Xms256M', '-Xmx512M'])
9
9
  puts Rjb.import('tagger')
@@ -3,7 +3,7 @@ module Treat
3
3
  module NamedEntity
4
4
  class Stanford
5
5
  # Require the Ruby-Java bridge.
6
- silently do
6
+ silence_warnings do
7
7
  require 'rjb'
8
8
  Rjb::load(nil, ['-Xms256M', '-Xmx1024M'])
9
9
  Rjb::add_jar('/ruby/treat/bin/treat/treat.jar')
@@ -23,7 +23,7 @@ module Treat
23
23
  properties.set_property('ner.model.3class', '/ruby/treat/bin/stanford/classifiers/all.3class.distsim.crf.ser.gz')
24
24
  properties.set_property('ner.model.7class', '/ruby/treat/bin/stanford/classifiers/muc.7class.distsim.crf.ser.gz')
25
25
  properties.set_property('ner.model.MISCclass', '/ruby/treat/bin/stanford/classifiers/conll.4class.distsim.crf.ser.gz')
26
- properties.set_property('parser.model', '/ruby/treat/bin/stanford_parser/grammar/englishPCFG.ser.gz')
26
+ properties.set_property('parser.model', '/ruby/treat/bin/stanford-parser/grammar/englishPCFG.ser.gz')
27
27
  silence_stream(STDOUT) do
28
28
  pipeline = StanfordCoreNLP.new(properties)
29
29
  end
@@ -2,9 +2,9 @@ module Treat
2
2
  module Extractors
3
3
  module Time
4
4
  class Chronic
5
- silently { require 'chronic' }
5
+ silence_warnings { require 'chronic' }
6
6
  def self.time(entity, options = {})
7
- silently { ::Chronic.parse(entity.to_s, {:guess => true}) }
7
+ silence_warnings { ::Chronic.parse(entity.to_s, {:guess => true}) }
8
8
  end
9
9
  end
10
10
  end
@@ -15,9 +15,9 @@ module Treat
15
15
  =end
16
16
  module Nickel
17
17
  require 'date'
18
- silently { require 'nickel' }
18
+ silence_warnings { require 'nickel' }
19
19
  def self.time(entity, options = {})
20
- n = silently { ::Nickel.parse(entity.to_s) }
20
+ n = silence_warnings { ::Nickel.parse(entity.to_s) }
21
21
  occ = n.occurrences[0]
22
22
  # Find the words..
23
23
  rec = occ.type.to_s.gsub('single', 'once').intern
@@ -11,12 +11,12 @@ module Treat
11
11
  # Machine Learning Research. 3 (Mar. 2003), 993-1022.
12
12
  class LDA
13
13
  # Require the lda-ruby gem.
14
- silently { require 'lda-ruby' }
14
+ silence_warnings { require 'lda-ruby' }
15
15
  # Monkey patch the TextCorpus class to call it without
16
16
  # having to create any files.
17
17
  Lda::TextCorpus.class_eval do
18
18
  # Ruby, Y U NO SHUT UP!
19
- silently { undef :initialize }
19
+ silence_warnings { undef :initialize }
20
20
  # Redefine initialize to take in an array of texts.
21
21
  def initialize(texts)
22
22
  super(nil)
@@ -1,11 +1,11 @@
1
1
  module Treat
2
2
  # Extractors extract specific information out of texts.
3
3
  module Extractors
4
- # Extracts a DateTime object containing a timestamp
5
- # from string representation of date/time.
4
+ # Extracts the time of an object and annotates it
5
+ # with specific information regarding time.
6
6
  module Time
7
7
  extend Group
8
- self.type = :computer
8
+ self.type = :annotator
9
9
  self.targets = [:word, :constituent, :symbol]
10
10
  end
11
11
  # Extract the topic from a text.
@@ -20,22 +20,25 @@ module Treat
20
20
  self.type = :annotator
21
21
  self.targets = [:collection, :document, :text, :zone, :sentence]
22
22
  end
23
- module Statistics
24
- extend Group
25
- self.type = :computer
26
- self.targets = [:entity]
27
- self.default = :none
28
- end
23
+ # Extract named entities from texts.
29
24
  module NamedEntity
30
25
  extend Group
31
26
  self.type = :computer
32
27
  self.targets = [:entity]
33
28
  end
29
+ # Extract the key sentences from a text.
34
30
  module KeySentences
35
31
  extend Group
36
32
  self.type = :computer
37
33
  self.targets = [:collection, :document, :text, :zone, :sentence]
38
34
  end
35
+ # This module should be moved out of here ASAP.
36
+ module Statistics
37
+ extend Group
38
+ self.type = :computer
39
+ self.targets = [:entity]
40
+ self.default = :none
41
+ end
39
42
  extend Treat::Category
40
43
  end
41
44
  end
data/lib/treat/feature.rb CHANGED
@@ -1,4 +1,9 @@
1
1
  module Treat
2
+ # This class represents a probabilistic feature;
3
+ # it is currently not used, because its
4
+ # behaviour is non-deterministic. Perhaps at
5
+ # some point this will be of value for specific
6
+ # algorithms and so I'm keeping it here.
2
7
  class Feature
3
8
  # Undefine all methods, except those that
4
9
  # create any problems (e.g. with serializing).
@@ -26,7 +31,7 @@ module Treat
26
31
  end
27
32
  end
28
33
  # Normalize the probabilities, so that
29
- # the sum of all probabilities is one,
34
+ # the sum of all probabilities is 1,
30
35
  # except if the sum of all probabilities
31
36
  # is already below one (in which case we
32
37
  # assume that the feature is intentionally
@@ -2,7 +2,7 @@ module Treat
2
2
  module Formatters
3
3
  module Cleaners
4
4
  class HTML
5
- silently { require 'hpricot' }
5
+ silence_warnings { require 'hpricot' }
6
6
  def self.clean(document, options = {})
7
7
  document.each_text do |text|
8
8
  text.set :html_value, text.value
@@ -10,6 +10,13 @@ module Treat
10
10
  self.targets = [:collection, :document]
11
11
  self.default = :autoselect
12
12
  end
13
+ # Serializers transform entities into a storable format.
14
+ module Serializers
15
+ extend Group
16
+ self.type = :computer
17
+ self.targets = [:entity]
18
+ self.default = :yaml
19
+ end
13
20
  # Unserializers recreate entities from a serialized format.
14
21
  module Unserializers
15
22
  extend Group
@@ -24,14 +31,7 @@ module Treat
24
31
  self.targets = [:entity]
25
32
  self.default = :tree
26
33
  end
27
- # Serializers transform entities into a storable format.
28
- module Serializers
29
- extend Group
30
- self.type = :computer
31
- self.targets = [:entity]
32
- self.default = :yaml
33
- end
34
- # Serializers transform entities into a storable format.
34
+ # Cleaners strip the markup from a text.
35
35
  module Cleaners
36
36
  extend Group
37
37
  self.type = :annotator