treat 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. data/INSTALL +1 -0
  2. data/README +3 -0
  3. data/TODO +14 -26
  4. data/bin/INFO +1 -1
  5. data/lib/treat/buildable.rb +10 -11
  6. data/lib/treat/categories.rb +8 -6
  7. data/lib/treat/category.rb +7 -2
  8. data/lib/treat/delegatable.rb +64 -56
  9. data/lib/treat/detectors/encoding/r_chardet19.rb +1 -1
  10. data/lib/treat/detectors/language/language_detector.rb +2 -1
  11. data/lib/treat/detectors/language/what_language.rb +2 -2
  12. data/lib/treat/detectors.rb +3 -0
  13. data/lib/treat/entities/entity.rb +1 -1
  14. data/lib/treat/entities.rb +9 -10
  15. data/lib/treat/exception.rb +3 -1
  16. data/lib/treat/extractors/named_entity/abner.rb +1 -1
  17. data/lib/treat/extractors/named_entity/stanford.rb +2 -2
  18. data/lib/treat/extractors/time/chronic.rb +2 -2
  19. data/lib/treat/extractors/time/nickel.rb +2 -2
  20. data/lib/treat/extractors/topic_words/lda.rb +2 -2
  21. data/lib/treat/extractors.rb +12 -9
  22. data/lib/treat/feature.rb +6 -1
  23. data/lib/treat/formatters/cleaners/html.rb +1 -1
  24. data/lib/treat/formatters.rb +8 -8
  25. data/lib/treat/group.rb +11 -10
  26. data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
  27. data/lib/treat/inflectors/{conjugators → conjugations}/linguistics.rb +6 -6
  28. data/lib/treat/inflectors/{declensors → declensions}/en.rb +2 -2
  29. data/lib/treat/inflectors/{declensors → declensions}/linguistics.rb +5 -5
  30. data/lib/treat/inflectors/ordinal_words/linguistics.rb +4 -4
  31. data/lib/treat/inflectors/{stemmers → stem}/porter.rb +1 -1
  32. data/lib/treat/inflectors/{stemmers → stem}/porter_c.rb +3 -3
  33. data/lib/treat/inflectors/{stemmers → stem}/uea.rb +3 -3
  34. data/lib/treat/inflectors.rb +8 -21
  35. data/lib/treat/kernel.rb +120 -0
  36. data/lib/treat/languages/arabic.rb +14 -0
  37. data/lib/treat/languages/categories.rb +5 -0
  38. data/lib/treat/languages/chinese.rb +12 -0
  39. data/lib/treat/languages/english/categories.rb +23 -0
  40. data/lib/treat/{resources → languages/english}/tags.rb +127 -184
  41. data/lib/treat/languages/english.rb +33 -0
  42. data/lib/treat/languages/french.rb +17 -0
  43. data/lib/treat/languages/german.rb +17 -0
  44. data/lib/treat/languages/italian.rb +14 -0
  45. data/lib/treat/{resources/languages.txt → languages/list.txt} +0 -0
  46. data/lib/treat/languages/xinhua.rb +12 -0
  47. data/lib/treat/languages.rb +91 -0
  48. data/lib/treat/lexicalizers/category/from_tag.rb +20 -8
  49. data/lib/treat/lexicalizers/synsets/rita_wn.rb +1 -1
  50. data/lib/treat/lexicalizers/tag/brill.rb +2 -1
  51. data/lib/treat/lexicalizers/tag/lingua.rb +2 -1
  52. data/lib/treat/lexicalizers/tag/stanford.rb +16 -15
  53. data/lib/treat/lexicalizers.rb +1 -1
  54. data/lib/treat/object.rb +6 -0
  55. data/lib/treat/processors/parsers/enju.rb +3 -2
  56. data/lib/treat/processors/parsers/stanford.rb +15 -12
  57. data/lib/treat/processors/segmenters/punkt.rb +1 -1
  58. data/lib/treat/processors/segmenters/stanford.rb +7 -5
  59. data/lib/treat/processors/segmenters/tactful.rb +1 -1
  60. data/lib/treat/processors/tokenizers/multilingual.rb +2 -2
  61. data/lib/treat/processors/tokenizers/stanford.rb +7 -5
  62. data/lib/treat/visitable.rb +2 -1
  63. data/lib/treat.rb +105 -54
  64. data/test/tc_entity.rb +5 -0
  65. data/test/tc_resources.rb +5 -5
  66. data/test/tc_treat.rb +1 -2
  67. data/test/tests.rb +2 -1
  68. metadata +63 -64
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +0 -96
  70. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +0 -213
  71. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +0 -68
  72. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +0 -6
  73. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +0 -12
  74. data/lib/treat/resources/categories.rb +0 -18
  75. data/lib/treat/resources/delegates.rb +0 -96
  76. data/lib/treat/resources/dependencies.rb +0 -0
  77. data/lib/treat/resources/edges.rb +0 -8
  78. data/lib/treat/resources/formats.rb +0 -23
  79. data/lib/treat/resources/languages.rb +0 -86
  80. data/lib/treat/resources.rb +0 -10
  81. data/lib/treat/utilities.rb +0 -127
data/INSTALL CHANGED
@@ -0,0 +1 @@
1
+ See the Wiki: https://github.com/louismullie/treat/wiki/Installing-Treat
data/README CHANGED
@@ -0,0 +1,3 @@
1
+ Treat - Text Retrieval and Annotation Toolkit
2
+
3
+ See the wiki for more information at https://github.com/louismullie/treat/wiki/.
data/TODO CHANGED
@@ -1,23 +1,29 @@
1
- == Before first release
1
+ ## Urgent
2
2
 
3
- - Linkers, inflectors
3
+ - Linkers
4
4
  - Check taggers for context
5
5
  - Stanford dependencies parse
6
- - Enju : make sure Enju parses constituents, remove_last, abbr
6
+ - Enju: test
7
7
  - Ocropus => use better function
8
- - Optimize magic methods... bigger or smaller, is_token? type methods, phrase categories.
8
+ - Optimize magic methods... is_token? type methods, phrase categories.
9
9
  - Move statistics?
10
10
  - Synset class move
11
11
  - general procedure for options, check that user doesn't want to change options...
12
- - Resources: dependencies vs. edges, PTB function tags
12
+ - Languages: dependencies vs. edges, PTB function tags
13
13
  - Check for # Fix everywhere
14
14
  - Check paths; parse bin paths
15
15
  - Ferret, Natural Inputs
16
16
  - Use consistently delegate
17
17
  - Text becomes section
18
+ - Remove top level
19
+ - Loading multiple JARs
20
+ - Tokenized sentences are not parsed
21
+ - Documentation
22
+ - Remove feature
18
23
 
19
- === Eventually
24
+ ## Eventually
20
25
 
26
+ - English inflector
21
27
  - RDF output
22
28
  - Apache OpenNLP
23
29
  - Ariel
@@ -44,24 +50,6 @@
44
50
  - Probabilistic features: rchardet19, what_language
45
51
  - Enju multithreading ?
46
52
  - String type detector for other languages
47
-
48
53
  - Automatic benchmark
49
-
50
- === Distant future
51
-
52
- - Spell Cheker Raspell
53
- - Multithreading
54
-
55
- === Checklist before releasing
56
-
57
- - Remove code from main page
58
- - Remove lib path from tests and main page
59
- - Remove docs
60
- - gem19 build treat.gemspec
61
-
62
- === Performance
63
-
64
- - Cache results that get computed often
65
- - Use .size == 0 instead of .empty?
66
- - Optimize method_missing using define_method, even dynamically
67
- - Array include is slow
54
+ - Raspell spell checker
55
+ - Multithreading
data/bin/INFO CHANGED
@@ -1 +1 @@
1
- This is where you can put your JAR files.
1
+ This is where Treat will look for the Stanford JAR files by default. You can change this to another directory by setting Treat.bin = '/path/to/your/folder/' at runtime.
data/lib/treat/buildable.rb CHANGED
@@ -1,6 +1,9 @@
1
1
  module Treat
2
+ # Represents an object that can be built
3
+ # from a folder of files, a specific file,
4
+ # a string or a numeric object. This class
5
+ # is pretty much self-explanatory.
2
6
  module Buildable
3
-
4
7
  def from_anything(file_or_value, id)
5
8
  if File.readable?(file_or_value.to_s)
6
9
  from_file(file_or_value)
@@ -9,11 +12,11 @@ module Treat
9
12
  elsif file_or_value.is_a?(Numeric)
10
13
  from_numeric(file_or_value)
11
14
  else
12
- raise "Unrecognizable input #{file_or_value}. "+
15
+ raise Treat::Exception,
16
+ "Unrecognizable input #{file_or_value}. "+
13
17
  "Use filename, folder, text or a number."
14
18
  end
15
19
  end
16
-
17
20
  def from_string(string)
18
21
  if self == Treat::Entities::Document ||
19
22
  self == Treat::Entities::Collection
@@ -37,7 +40,6 @@ module Treat
37
40
  end
38
41
  return Treat::Entities::Unknown.new(string)
39
42
  end
40
-
41
43
  def from_numeric(numeric)
42
44
  unless self == Treat::Entities::Number
43
45
  raise Treat::Exception,
@@ -46,7 +48,6 @@ module Treat
46
48
  end
47
49
  Treat::Entities::Number.new(numeric.to_s)
48
50
  end
49
-
50
51
  def from_folder(folder)
51
52
  unless FileTest.directory?(folder)
52
53
  raise Treat::Exception,
@@ -69,7 +70,6 @@ module Treat
69
70
  end
70
71
  c
71
72
  end
72
-
73
73
  def from_file(file)
74
74
  unless File.readable?(file)
75
75
  raise Treat::Exception,
@@ -79,7 +79,8 @@ module Treat
79
79
  from_folder(file)
80
80
  else
81
81
  ext = file.split('.')[-1]
82
- ext = 'yaml' if ext == 'yml' # Humanize the yml extension.
82
+ # Humanize the yaml extension.
83
+ ext = 'yaml' if ext == 'yml'
83
84
  if Treat::Formatters::Unserializers.list.
84
85
  include?(ext.downcase.intern)
85
86
  from_serialized_file(file)
@@ -88,7 +89,6 @@ module Treat
88
89
  end
89
90
  end
90
91
  end
91
-
92
92
  def from_raw_file(file)
93
93
  unless self == Treat::Entities::Document
94
94
  raise Treat::Exception,
@@ -98,9 +98,9 @@ module Treat
98
98
  d = Treat::Entities::Document.new(file)
99
99
  d.read
100
100
  end
101
-
102
101
  def from_serialized_file(file)
103
- unless [Treat::Entities::Document, Treat::Entities::Collection].include?(self)
102
+ unless [Treat::Entities::Document,
103
+ Treat::Entities::Collection].include?(self)
104
104
  raise Treat::Exception,
105
105
  "Cannot create something else than a " +
106
106
  "document from raw file '#{file}'."
@@ -110,6 +110,5 @@ module Treat
110
110
  d.children[0].set_as_root!
111
111
  d.children[0]
112
112
  end
113
-
114
113
  end
115
114
  end
data/lib/treat/categories.rb CHANGED
@@ -1,16 +1,18 @@
1
1
  module Treat
2
+ # This module keeps track of all categories that
3
+ # exist and the methods they implement, and is
4
+ # responsible for including the categories.
2
5
  module Categories
3
- # Modify the module that includes Category to
4
- # setup autoload, delegators and provide a list
5
- # of methods.
6
+ # A list of categories.
6
7
  class << self; attr_accessor :list; end
7
8
  self.list = []
8
- # Boolean - does any of the categories
9
- # groups respond to the symbol.
9
+ # Boolean - does any of the categories have
10
+ # a method that corresponds to sym?
10
11
  def self.have_method?(sym); methods.include?(sym); end
12
+ # Cache the list of methods once it has been computed.
13
+ @@methods = []
11
14
  # Provide a list of all methods implemented
12
15
  # by all Treat categories.
13
- @@methods = []
14
16
  def self.methods
15
17
  return @@methods unless @@methods.empty?
16
18
  self.list.each do |ns|
data/lib/treat/category.rb CHANGED
@@ -1,7 +1,11 @@
1
1
  module Treat
2
- # Provides functions common to all algorithm categories.
2
+ # Clusters together groups of algorithms that
3
+ # perform similar functions.
3
4
  module Category
5
+ # Require the Group class.
4
6
  require 'treat/group'
7
+ # Add delegators to the Entities based on the
8
+ # configuration for a given category.
5
9
  def self.extended(category)
6
10
  Treat::Categories.list << category
7
11
  category.module_eval do
@@ -14,9 +18,10 @@ module Treat
14
18
  end
15
19
  end
16
20
  end
21
+ # Provides a list of groups within this category.
17
22
  def groups; self.constants; end
18
23
  # Provide a list of methods implemented in
19
- # the groups contained within that
24
+ # the groups contained within this category.
20
25
  def methods
21
26
  methods = []
22
27
  groups.each do |group|
data/lib/treat/delegatable.rb CHANGED
@@ -2,34 +2,8 @@ module Treat
2
2
  # Makes a class delegatable, allowing calls on it to be forwarded
3
3
  # to a delegate class performing the appropriate call.
4
4
  module Delegatable
5
-
6
- # Get the default delegate for that language
7
- # inside the given group.
8
- def get_language_delegate(language, group)
9
- lang = Treat::Resources::Languages.describe(language)
10
- lclass = cc(lang).intern
11
- if Treat::Resources::Delegates.constants.include?(lclass)
12
- cat = group.to_s.split('::')[-2].intern
13
- lclass = Treat::Resources::Delegates.
14
- const_get(lclass).const_get(cat)
15
- g = ucc(cl(group)).intern
16
- if !lclass[g] || !lclass[g][0]
17
- d = ucc(cl(group))
18
- d.gsub!('_', ' ')
19
- d = d[0..-2] if d[-1] == 's'
20
- d = 'delegator to find ' + d
21
- raise Treat::Exception, "No #{d}" +
22
- " is available for the #{lang} language."
23
- end
24
- return lclass[g][0]
25
- else
26
- raise Treat::Exception,
27
- "Language '#{lang}' is not supported (yet)."
28
- end
29
- end
30
-
31
5
  # Add decorator methods to entities.
32
- def decorate(group, m)
6
+ def add_decorators(group, m)
33
7
  decorators = group.methods -
34
8
  Object.methods -
35
9
  [:type, :type=, :targets, :targets=,
@@ -42,49 +16,83 @@ module Treat
42
16
  end
43
17
  end
44
18
  end
45
-
46
- # Raise an exception and suggest alternatives.
47
- def delegate_not_found(klass, group)
48
- "Algorithm '#{ucc(klass)}' couldn't be found in group #{group}." +
49
- did_you_mean?(group.list.map { |c| ucc(c) }, ucc(klass))
50
- end
51
-
52
19
  # Add delegator group to all entities of a class.
53
20
  def add_delegators(group)
54
21
  # Define each method in group.
55
22
  self.class_eval do
56
23
  m = group.method
57
- decorate(group, m)
24
+ add_decorators(group, m)
58
25
  define_method(m) do |delegate=nil, options={}|
59
26
  decorator = options.delete(:decorator)
60
27
  puts self.id if !@features
61
28
  if !@features[m].nil?
62
29
  @features[m]
63
30
  else
64
- if delegate.nil?
65
- delegate = group.default.nil? ?
66
- self.class.get_language_delegate(language, group) :
67
- group.default
68
- raise "No default delegate for #{group}." if delegate == :none
69
- end
70
- if not group.list.include?(delegate)
71
- raise Treat::Exception,
72
- self.class.delegate_not_found(delegate, group)
73
- else
74
- delegate_klass = group.const_get(:"#{cc(delegate.to_s)}")
75
- result = accept(group, delegate_klass, m, options)
76
- if decorator
77
- result = group.send(decorator, self, result)
78
- end
79
- if group.type == :annotator
80
- f = decorator.nil? ? m : decorator
81
- @features[f] = result
82
- end
83
- result
84
- end
31
+ self.class.call_delegator(
32
+ self, m, delegate, decorator,
33
+ group, options)
85
34
  end
86
35
  end
87
36
  end
88
37
  end
38
+ # Call a delegator.
39
+ def call_delegator(entity, m, delegate, decorator, group, options)
40
+ if delegate.nil?
41
+ delegate = get_missing_delegate(entity, group)
42
+ end
43
+ if not group.list.include?(delegate)
44
+ raise Treat::Exception, delegate_not_found(delegate, group)
45
+ else
46
+ delegate_klass = group.const_get(:"#{cc(delegate.to_s)}")
47
+ result = entity.accept(group, delegate_klass, m, options)
48
+ if decorator
49
+ result = group.send(decorator, self, result)
50
+ end
51
+ if group.type == :annotator
52
+ f = decorator.nil? ? m : decorator
53
+ entity.features[f] = result
54
+ end
55
+ result
56
+ end
57
+ end
58
+ # Get the default delegate for that language
59
+ # inside the given group.
60
+ def get_language_delegate(language, group)
61
+ lang = Treat::Languages.describe(language)
62
+ lclass = cc(lang).intern
63
+ if Treat::Languages.constants.include?(lclass)
64
+ cat = group.to_s.split('::')[-2].intern
65
+ lclass = Treat::Languages.const_get(lclass).const_get(cat)
66
+ g = ucc(cl(group)).intern
67
+ if !lclass[g] || !lclass[g][0]
68
+ d = ucc(cl(group))
69
+ d.gsub!('_', ' ')
70
+ d = d[0..-2] if d[-1] == 's'
71
+ d = 'delegator to find ' + d
72
+ raise Treat::Exception, "No #{d}" +
73
+ " is available for the #{lang} language."
74
+ end
75
+ return lclass[g][0]
76
+ else
77
+ raise Treat::Exception,
78
+ "Language '#{lang}' is not supported (yet)."
79
+ end
80
+ end
81
+ # Get which delegate to use if none has been supplied.
82
+ def get_missing_delegate(entity, group)
83
+ delegate = group.default.nil? ?
84
+ self.get_language_delegate(entity.language, group) :
85
+ group.default
86
+ if delegate == :none
87
+ raise NAT::Exception,
88
+ "There is intentionally no default delegate for #{group}."
89
+ end
90
+ delegate
91
+ end
92
+ # Return an error message and suggest possible typos.
93
+ def delegate_not_found(klass, group)
94
+ "Algorithm '#{ucc(klass)}' couldn't be found in group #{group}." +
95
+ did_you_mean?(group.list.map { |c| ucc(c) }, ucc(klass))
96
+ end
89
97
  end
90
98
  end
data/lib/treat/detectors/encoding/r_chardet19.rb CHANGED
@@ -2,7 +2,7 @@ module Treat
2
2
  module Detectors
3
3
  module Encoding
4
4
  # Require the 'rchardet19' gem.
5
- silently { require 'rchardet19' }
5
+ silence_warnings { require 'rchardet19' }
6
6
  # A wrapper for the 'rchardet19' gem, which
7
7
  # detects the encoding of a file.
8
8
  class RChardet19
data/lib/treat/detectors/language/language_detector.rb CHANGED
@@ -9,7 +9,8 @@ module Treat
9
9
  dlvl = Treat.language_detection_level
10
10
  if (Entities.rank(entity.type) < Entities.rank(dlvl)) &&
11
11
  entity.has_parent?
12
- return entity.ancestor_with_type(dlvl).language
12
+ anc = entity.ancestor_with_type(dlvl)
13
+ return anc.language if anc
13
14
  end
14
15
  end
15
16
  end
data/lib/treat/detectors/language/what_language.rb CHANGED
@@ -2,7 +2,7 @@ module Treat
2
2
  module Detectors
3
3
  module Language
4
4
  # Require the 'whatlanguage' gem.
5
- silently { require 'whatlanguage' }
5
+ silence_warnings { require 'whatlanguage' }
6
6
  # Adaptor for the 'whatlanguage' gem, which
7
7
  # performs probabilistic language detection.
8
8
  class WhatLanguage < LanguageDetector
@@ -19,7 +19,7 @@ module Treat
19
19
  all = @@wl.process_text(entity.to_s)
20
20
  lang = {}
21
21
  all.each do |k,v|
22
- lang[Treat::Resources::Languages.find(k)] = v
22
+ lang[Treat::Languages.find(k)] = v
23
23
  end
24
24
  Treat::Feature.new(lang).best
25
25
  end
data/lib/treat/detectors.rb CHANGED
@@ -1,6 +1,9 @@
1
1
  module Treat
2
2
  # Detectors detect a specific meta-information about
3
3
  # an entity, such as encoding, format and language.
4
+ #
5
+ # Detectors are language-independent, and thus they
6
+ # are default algorithms specified for each of them.
4
7
  module Detectors
5
8
  # Group for algorithms that detect encoding.
6
9
  module Encoding
data/lib/treat/entities/entity.rb CHANGED
@@ -93,7 +93,7 @@ module Treat
93
93
  # dispatches done by Ruby to improve performance.
94
94
  def parse_magic_method(sym, *args, &block)
95
95
  @@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
96
- @@cats_regexp ||= "(#{Treat::Resources::Categories::List.join('|')})"
96
+ @@cats_regexp ||= "(#{Treat::Languages::English::Categories.join('|')})"
97
97
  method = sym.to_s =~ /entities/ ?
98
98
  sym.to_s.gsub('entities', 'entitys'):
99
99
  method = sym.to_s
data/lib/treat/entities.rb CHANGED
@@ -4,15 +4,14 @@ module Treat
4
4
  #
5
5
  # - Collection
6
6
  # - Document
7
- # - Text
8
7
  # - Zone (a Section, Title, Paragraph, or List)
9
8
  # - Sentence
10
9
  # - Constituent (a Phrase or Clause)
11
10
  # - Token (a Word, Number, Punctuation, or Symbol).
12
11
  module Entities
13
- # Require Entity first, since the other classes
14
- # extend this class.
12
+ # Require Entity first.
15
13
  require 'treat/entities/entity'
14
+ # Then require all possible entities.
16
15
  require 'treat/entities/collection'
17
16
  require 'treat/entities/document'
18
17
  require 'treat/entities/text'
@@ -40,13 +39,13 @@ module Treat
40
39
  # comparison of entity types.
41
40
  def self.rank(type)
42
41
  klass = Entities.const_get(cc(type))
43
- return 6 if klass == Collection || klass < Collection
44
- return 5 if klass == Document || klass < Document
45
- return 4 if klass == Text || klass < Text
46
- return 3 if klass == Zone || klass < Zone
47
- return 2 if klass == Sentence || klass < Sentence
48
- return 1 if klass == Constituent || klass < Constituent
49
- return 0 if klass == Token || klass < Token
42
+ compare = lambda { |a,b| a == b || a < b }
43
+ return 0 if compare.call(klass, Token)
44
+ return 1 if compare.call(klass, Constituent)
45
+ return 2 if compare.call(klass, Sentence)
46
+ return 4 if compare.call(klass, Document)
47
+ return 3 if compare.call(klass, Section)
48
+ return 5 if compare.call(klass, Collection)
50
49
  end
51
50
  end
52
51
  end
data/lib/treat/exception.rb CHANGED
@@ -1,5 +1,7 @@
1
1
  module Treat
2
- # Exception class for the Treat library.
2
+ # Custom exception class for the Treat toolkit.
3
+ # Used to distinguish between errors raised by
4
+ # gems or Ruby from errors raised by the toolkit.
3
5
  class Exception < ::Exception
4
6
  end
5
7
  end
data/lib/treat/extractors/named_entity/abner.rb CHANGED
@@ -3,7 +3,7 @@ module Treat
3
3
  module NamedEntity
4
4
  class Abner
5
5
  # Require the Ruby-Java bridge.
6
- silently do
6
+ silence_warnings do
7
7
  require 'rjb'
8
8
  Rjb::load('', ['-Xms256M', '-Xmx512M'])
9
9
  puts Rjb.import('tagger')
data/lib/treat/extractors/named_entity/stanford.rb CHANGED
@@ -3,7 +3,7 @@ module Treat
3
3
  module NamedEntity
4
4
  class Stanford
5
5
  # Require the Ruby-Java bridge.
6
- silently do
6
+ silence_warnings do
7
7
  require 'rjb'
8
8
  Rjb::load(nil, ['-Xms256M', '-Xmx1024M'])
9
9
  Rjb::add_jar('/ruby/treat/bin/treat/treat.jar')
@@ -23,7 +23,7 @@ module Treat
23
23
  properties.set_property('ner.model.3class', '/ruby/treat/bin/stanford/classifiers/all.3class.distsim.crf.ser.gz')
24
24
  properties.set_property('ner.model.7class', '/ruby/treat/bin/stanford/classifiers/muc.7class.distsim.crf.ser.gz')
25
25
  properties.set_property('ner.model.MISCclass', '/ruby/treat/bin/stanford/classifiers/conll.4class.distsim.crf.ser.gz')
26
- properties.set_property('parser.model', '/ruby/treat/bin/stanford_parser/grammar/englishPCFG.ser.gz')
26
+ properties.set_property('parser.model', '/ruby/treat/bin/stanford-parser/grammar/englishPCFG.ser.gz')
27
27
  silence_stream(STDOUT) do
28
28
  pipeline = StanfordCoreNLP.new(properties)
29
29
  end
data/lib/treat/extractors/time/chronic.rb CHANGED
@@ -2,9 +2,9 @@ module Treat
2
2
  module Extractors
3
3
  module Time
4
4
  class Chronic
5
- silently { require 'chronic' }
5
+ silence_warnings { require 'chronic' }
6
6
  def self.time(entity, options = {})
7
- silently { ::Chronic.parse(entity.to_s, {:guess => true}) }
7
+ silence_warnings { ::Chronic.parse(entity.to_s, {:guess => true}) }
8
8
  end
9
9
  end
10
10
  end
data/lib/treat/extractors/time/nickel.rb CHANGED
@@ -15,9 +15,9 @@ module Treat
15
15
  =end
16
16
  module Nickel
17
17
  require 'date'
18
- silently { require 'nickel' }
18
+ silence_warnings { require 'nickel' }
19
19
  def self.time(entity, options = {})
20
- n = silently { ::Nickel.parse(entity.to_s) }
20
+ n = silence_warnings { ::Nickel.parse(entity.to_s) }
21
21
  occ = n.occurrences[0]
22
22
  # Find the words..
23
23
  rec = occ.type.to_s.gsub('single', 'once').intern
data/lib/treat/extractors/topic_words/lda.rb CHANGED
@@ -11,12 +11,12 @@ module Treat
11
11
  # Machine Learning Research. 3 (Mar. 2003), 993-1022.
12
12
  class LDA
13
13
  # Require the lda-ruby gem.
14
- silently { require 'lda-ruby' }
14
+ silence_warnings { require 'lda-ruby' }
15
15
  # Monkey patch the TextCorpus class to call it without
16
16
  # having to create any files.
17
17
  Lda::TextCorpus.class_eval do
18
18
  # Ruby, Y U NO SHUT UP!
19
- silently { undef :initialize }
19
+ silence_warnings { undef :initialize }
20
20
  # Redefine initialize to take in an array of texts.
21
21
  def initialize(texts)
22
22
  super(nil)
data/lib/treat/extractors.rb CHANGED
@@ -1,11 +1,11 @@
1
1
  module Treat
2
2
  # Extractors extract specific information out of texts.
3
3
  module Extractors
4
- # Extracts a DateTime object containing a timestamp
5
- # from string representation of date/time.
4
+ # Extracts the time of an object and annotates it
5
+ # with specific information regarding time.
6
6
  module Time
7
7
  extend Group
8
- self.type = :computer
8
+ self.type = :annotator
9
9
  self.targets = [:word, :constituent, :symbol]
10
10
  end
11
11
  # Extract the topic from a text.
@@ -20,22 +20,25 @@ module Treat
20
20
  self.type = :annotator
21
21
  self.targets = [:collection, :document, :text, :zone, :sentence]
22
22
  end
23
- module Statistics
24
- extend Group
25
- self.type = :computer
26
- self.targets = [:entity]
27
- self.default = :none
28
- end
23
+ # Extract named entities from texts.
29
24
  module NamedEntity
30
25
  extend Group
31
26
  self.type = :computer
32
27
  self.targets = [:entity]
33
28
  end
29
+ # Extract the key sentences from a text.
34
30
  module KeySentences
35
31
  extend Group
36
32
  self.type = :computer
37
33
  self.targets = [:collection, :document, :text, :zone, :sentence]
38
34
  end
35
+ # This module should be moved out of here ASAP.
36
+ module Statistics
37
+ extend Group
38
+ self.type = :computer
39
+ self.targets = [:entity]
40
+ self.default = :none
41
+ end
39
42
  extend Treat::Category
40
43
  end
41
44
  end
data/lib/treat/feature.rb CHANGED
@@ -1,4 +1,9 @@
1
1
  module Treat
2
+ # This class represents a probabilistic feature;
3
+ # it is currently not used, because its
4
+ # behaviour is non-deterministic. Perhaps at
5
+ # some point this will be of value for specific
6
+ # algorithms and so I'm keeping it here.
2
7
  class Feature
3
8
  # Undefine all methods, except those that
4
9
  # create any problems (e.g. with serializing).
@@ -26,7 +31,7 @@ module Treat
26
31
  end
27
32
  end
28
33
  # Normalize the probabilities, so that
29
- # the sum of all probabilities is one,
34
+ # the sum of all probabilities is 1,
30
35
  # except if the sum of all probabilities
31
36
  # is already below one (in which case we
32
37
  # assume that the feature is intentionally
data/lib/treat/formatters/cleaners/html.rb CHANGED
@@ -2,7 +2,7 @@ module Treat
2
2
  module Formatters
3
3
  module Cleaners
4
4
  class HTML
5
- silently { require 'hpricot' }
5
+ silence_warnings { require 'hpricot' }
6
6
  def self.clean(document, options = {})
7
7
  document.each_text do |text|
8
8
  text.set :html_value, text.value
data/lib/treat/formatters.rb CHANGED
@@ -10,6 +10,13 @@ module Treat
10
10
  self.targets = [:collection, :document]
11
11
  self.default = :autoselect
12
12
  end
13
+ # Serializers transform entities into a storable format.
14
+ module Serializers
15
+ extend Group
16
+ self.type = :computer
17
+ self.targets = [:entity]
18
+ self.default = :yaml
19
+ end
13
20
  # Unserializers recreate entities from a serialized format.
14
21
  module Unserializers
15
22
  extend Group
@@ -24,14 +31,7 @@ module Treat
24
31
  self.targets = [:entity]
25
32
  self.default = :tree
26
33
  end
27
- # Serializers transform entities into a storable format.
28
- module Serializers
29
- extend Group
30
- self.type = :computer
31
- self.targets = [:entity]
32
- self.default = :yaml
33
- end
34
- # Serializers transform entities into a storable format.
34
+ # Cleaners strip a text from its mark up.
35
35
  module Cleaners
36
36
  extend Group
37
37
  self.type = :annotator