treat 2.0.3 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. data/lib/treat/config/data/languages/agnostic.rb +6 -3
  2. data/lib/treat/config/data/languages/english.rb +1 -1
  3. data/lib/treat/config/data/workers/extractors.rb +8 -0
  4. data/lib/treat/loaders/stanford.rb +2 -0
  5. data/lib/treat/version.rb +1 -1
  6. data/lib/treat/workers/extractors/distance/levenshtein.rb +35 -0
  7. data/lib/treat/workers/extractors/name_tag/stanford.rb +4 -1
  8. data/lib/treat/workers/extractors/similarity/jaro_winkler.rb +38 -0
  9. data/lib/treat/workers/extractors/similarity/tf_idf.rb +19 -3
  10. data/lib/treat/workers/extractors/time/chronic.rb +6 -41
  11. data/lib/treat/workers/extractors/time/kronic.rb +20 -0
  12. data/lib/treat/workers/extractors/time/nickel.rb +0 -15
  13. data/lib/treat/workers/extractors/time/ruby.rb +2 -33
  14. data/lib/treat/workers/lexicalizers/taggers/stanford.rb +11 -10
  15. data/lib/treat/workers/processors/parsers/stanford.rb +60 -112
  16. data/spec/entities/collection.rb +29 -25
  17. data/spec/entities/document.rb +45 -44
  18. data/spec/entities/entity.rb +295 -294
  19. data/spec/entities/phrase.rb +21 -17
  20. data/spec/entities/token.rb +43 -40
  21. data/spec/entities/word.rb +5 -1
  22. data/spec/entities/zone.rb +26 -22
  23. data/spec/helper.rb +7 -2
  24. data/spec/learning/data_set.rb +145 -141
  25. data/spec/learning/export.rb +46 -42
  26. data/spec/learning/problem.rb +114 -110
  27. data/spec/learning/question.rb +46 -42
  28. data/spec/treat.rb +41 -37
  29. data/spec/workers/agnostic.rb +2 -2
  30. data/spec/workers/english.rb +12 -12
  31. metadata +7 -8
  32. data/files/21552208.html +0 -786
  33. data/files/nethttp-cheat-sheet-2940.html +0 -393
  34. data/lib/treat/workers/extractors/similarity/levenshtein.rb +0 -36
  35. data/spec/sandbox.rb +0 -294
  36. data/spec/workers/examples/english/mathematicians/euler.html +0 -21
data/lib/treat/config/data/languages/agnostic.rb CHANGED
@@ -4,9 +4,10 @@
  'bson_ext', 'mongo', 'lda-ruby',
  'stanford-core-nlp', 'linguistics',
  'ruby-readability', 'whatlanguage',
- 'chronic', 'nickel', 'decisiontree',
+ 'chronic', 'kronic', 'nickel', 'decisiontree',
  'rb-libsvm', 'ruby-fann', 'zip', 'loggability',
- 'tf-idf-similarity', 'narray', 'fastimage'
+ 'tf-idf-similarity', 'narray', 'fastimage',
+ 'fuzzy-string-match', 'levenshtein-ffi'
  ],
  workers: {
  learners: {
@@ -16,7 +17,9 @@
  keywords: [:tf_idf],
  language: [:what_language],
  topic_words: [:lda],
- tf_idf: [:native]
+ tf_idf: [:native],
+ distance: [:levenshtein],
+ similarity: [:jaro_winkler, :tf_idf]
  },
  formatters: {
  serializers: [:xml, :yaml, :mongo],
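The gems added to the dependency list above back the new distance, similarity and time workers introduced in this release; the workers call require themselves, so the gems stay optional. A minimal Gemfile sketch (gem names taken from the list above, everything else assumed):

  # Optional gems backing the new 2.0.4 workers; only needed when the
  # corresponding worker is actually invoked.
  gem 'kronic'              # Treat::Workers::Extractors::Time::Kronic
  gem 'fuzzy-string-match'  # Treat::Workers::Extractors::Similarity::JaroWinkler
  gem 'levenshtein-ffi'     # Treat::Workers::Extractors::Distance::Levenshtein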
data/lib/treat/config/data/languages/english.rb CHANGED
@@ -14,7 +14,7 @@
  ],
  workers: {
  extractors: {
- time: [:chronic, :ruby, :nickel],
+ time: [:chronic, :kronic, :ruby, :nickel],
  topics: [:reuters],
  name_tag: [:stanford]
  },
data/lib/treat/config/data/workers/extractors.rb CHANGED
@@ -27,5 +27,13 @@
  tf_idf: {
  type: :annotator,
  targets: [:word]
+ },
+ similarity: {
+ type: :computer,
+ targets: [:entity]
+ },
+ distance: {
+ type: :computer,
+ targets: [:entity]
  }
  }
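The two new groups are declared as :computer workers that target any entity. A hedged sketch of how they might be reached through Treat's usual entity-level dispatch (the call form below, with the worker selected by its symbol and options passed along, is assumed rather than shown in this diff):

  # Assumed dispatch: entity.<group>(:worker_name, options).
  word = Treat::Entities::Word.new('spelling')
  word.distance(:levenshtein, to: 'speling')      # => 1 (one deletion)
  word.similarity(:jaro_winkler, to: 'speling')   # => a score close to 1.0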
data/lib/treat/loaders/stanford.rb CHANGED
@@ -42,6 +42,8 @@ class Treat::Loaders::Stanford
  StanfordCoreNLP.log_file = '/dev/null'
  end

+ StanfordCoreNLP.bind
+
  @@loaded = true

  end
data/lib/treat/version.rb CHANGED
@@ -1,7 +1,7 @@
  module Treat

  # The current version of Treat.
- VERSION = "2.0.3"
+ VERSION = "2.0.4"

  # Treat requires Ruby >= 1.9.2
  if RUBY_VERSION < '1.9.2'
data/lib/treat/workers/extractors/distance/levenshtein.rb ADDED
@@ -0,0 +1,35 @@
+ # The C extension uses char* strings, and so Unicode strings
+ # will give incorrect distances. Need to provide a pure
+ # implementation if that's the case (FIX).
+ class Treat::Workers::Extractors::Distance::Levenshtein
+
+ require 'levenshtein'
+
+ DefaultOptions = {
+ ins_cost: 1,
+ del_cost: 1,
+ sub_cost: 1
+ }
+
+ @@matcher = nil
+
+ # Return the levensthein distance between
+ # two strings taking into account the costs
+ # of insertion, deletion, and substitution.
+ def self.distance(entity, options)
+
+ options = DefaultOptions.merge(options)
+
+ unless options[:to]
+ raise Treat::Exception, "Must supply " +
+ "a string/entity to compare to using " +
+ "the option :to for this worker."
+ end
+
+ a, b = entity.to_s, options[:to].to_s
+
+ Levenshtein.distance(a, b)
+
+ end
+
+ end
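A quick direct-invocation sketch for the new worker (not part of the diff); anything responding to #to_s works, since both sides are stringified before Levenshtein.distance is called. Note that the cost options are merged but not yet forwarded to the gem:

  require 'treat'
  Treat::Workers::Extractors::Distance::Levenshtein.distance('kitten', to: 'sitting')
  # => 3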
data/lib/treat/workers/extractors/name_tag/stanford.rb CHANGED
@@ -1,7 +1,7 @@
  # Named entity tag extraction using the Stanford NLP
  # Deterministic Coreference Resolver, which implements a
  # multi-pass sieve coreference resolution (or anaphora
- # resolution) system.
+ # resolution) system based on conditional random fields.
  #
  # Original paper: Heeyoung Lee, Yves Peirsman, Angel
  # Chang, Nathanael Chambers, Mihai Surdeanu, Dan Jurafsky.
@@ -24,6 +24,9 @@ class Treat::Workers::Extractors::NameTag::Stanford

  unless classifier = @@classifiers[language]
  model = Treat::Loaders::Stanford.find_model(:ner, language)
+ unless StanfordCoreNLP.const_defined?('CRFClassifier')
+ StanfordCoreNLP.load_class('CRFClassifier', 'edu.stanford.nlp.ie.crf')
+ end
  classifier = StanfordCoreNLP::CRFClassifier.getClassifier(model)
  @@classifiers[language] = classifier
  end
data/lib/treat/workers/extractors/similarity/jaro_winkler.rb ADDED
@@ -0,0 +1,38 @@
+ # Similarity measure for short strings such as person names.
+ # C extension won't work for Unicode strings; need to set
+ # extension to "pure" in that case (FIX).
+ class Treat::Workers::Extractors::Similarity::JaroWinkler
+
+ require 'fuzzystringmatch'
+
+ DefaultOptions = {
+ threshold: 0.7,
+ implementation: nil
+ }
+
+ @@matcher = nil
+
+ def self.similarity(entity, options={})
+
+ options = DefaultOptions.merge(options)
+
+ unless options[:to]
+ raise Treat::Exception, "Must supply " +
+ "a string/entity to compare to using " +
+ "the option :to for this worker."
+ end
+
+ unless @@matcher
+ impl = options[:implementation]
+ impl ||= defined?(JRUBY_VERSION) ? :pure : :native
+ klass = FuzzyStringMatch::JaroWinkler
+ @@matcher = klass.create(impl)
+ end
+
+ a, b = entity.to_s, options[:to].to_s
+
+ @@matcher.getDistance(a, b)
+
+ end
+
+ end
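A matching sketch for the Jaro-Winkler worker; on JRuby the pure-Ruby matcher is selected automatically, otherwise the C extension is used unless :implementation is overridden:

  Treat::Workers::Extractors::Similarity::JaroWinkler.similarity('martha', to: 'marhta')
  # => roughly 0.96 (1.0 means identical strings)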
data/lib/treat/workers/extractors/similarity/tf_idf.rb CHANGED
@@ -2,12 +2,28 @@
  class Treat::Workers::Extractors::Similarity::TfIdf

  require 'tf-idf-similarity'
+
+ def self.similarity(entity, options={})

- @collections = {}
+ raise 'Not currently implemented.'
+
+ unless options[:to] &&
+ options[:to].type == :document
+ raise Treat::Exception, 'Must supply ' +
+ 'a document to compare to using ' +
+ 'the option :to for this worker.'
+ end

- def self.tf_idf(collection, options={})
+ unless options[:to].parent_collection &&
+ entity.parent_collection
+ raise Treat::Exception, 'The TF*IDF ' +
+ 'similarity algorithm can only be applied ' +
+ 'to documents that are inside collections.'
+ end
+
  coll = TfIdfSimilarity::Collection.new
- collection.each_document do |doc|
+
+ entity.each_document do |doc|
  tdoc = TfIdfSimilarity::Document.new(doc.to_s)
  term_counts = Hash.new(0)
  doc.each_word do |word|
data/lib/treat/workers/extractors/time/chronic.rb CHANGED
@@ -2,52 +2,17 @@
  # Ruby natural language date parser.
  class Treat::Workers::Extractors::Time::Chronic

- # Require the 'chronic' gem.
- silence_warnings { require 'chronic' }
-
- # Require the Ruby DateTime module
+ require 'chronic'
  require 'date'
+
+ DefaultOptions = {guess: true}

  # Return the date information contained within
  # the entity by parsing it with the 'chronic' gem.
- #
- # Options: none.
  def self.time(entity, options = {})
-
- s = entity.to_s
- return if s =~ /^[0-9]+$/
- time = nil
-
- silence_warnings do
- time = ::Chronic.parse(s, {:guess => true})
- end
-
- if entity.has_parent? && remove_time_from_ancestors(entity, time)
- nil
- else
- time
- end
-
- end
-
- # Keeps the lowest-level time annotations that do
- # not conflict with a higher time annotation.
- # Returns true if the entity conflicts with a
- # higher-level time annotation.
- def self.remove_time_from_ancestors(entity, time)
-
- entity.ancestors_with_type(:phrase).each do |a|
-
- next if !a.has?(:time)
- unless a.get(:time) == time
- return true
- end
- a.unset(:time)
-
- end
-
- false
-
+ options = DefaultOptions.merge(options)
+ time = ::Chronic.parse(entity.to_s, options)
+ time ? DateTime.parse(time.to_s) : nil
  end

  end
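The rewritten worker simply returns a DateTime, or nil when Chronic cannot parse the string, instead of reconciling time annotations across ancestors. A hedged sketch, assuming the worker is called directly with a plain string:

  Treat::Workers::Extractors::Time::Chronic.time('3rd of May 2012')
  # => DateTime for May 3, 2012 (with :guess, Chronic picks the middle of the span)
  Treat::Workers::Extractors::Time::Chronic.time('gibberish')
  # => nil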
data/lib/treat/workers/extractors/time/kronic.rb ADDED
@@ -0,0 +1,20 @@
+ # Time/date extraction using a simple rule-based library.
+ #
+ # Supported formats: Today, yesterday, tomorrow,
+ # last thursday, this thursday, 14 Sep, 14 June 2010.
+ # Any dates without a year are assumed to be in the past.
+ class Treat::Workers::Extractors::Time::Kronic
+
+ require 'kronic'
+ require 'date'
+
+ # Return the date information contained within
+ # the entity by parsing it with the 'chronic' gem.
+ #
+ # Options: none.
+ def self.time(entity, options = {})
+ time = Kronic.parse(entity.to_s)
+ time.is_a?(DateTime) ? time : nil
+ end
+
+ end
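A direct-call sketch for the new Kronic worker; the guard only lets through values that are already DateTime instances, so anything else Kronic returns comes back as nil:

  Treat::Workers::Extractors::Time::Kronic.time('14 Sep 2010')
  # => a DateTime if Kronic yields one, nil otherwise
  Treat::Workers::Extractors::Time::Kronic.time('not a date')
  # => nil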
data/lib/treat/workers/extractors/time/nickel.rb CHANGED
@@ -47,7 +47,6 @@ class Treat::Workers::Extractors::Time::Nickel
  occ.interval : :none
  time_recurrence_interval = interval

-
  s = [occ.start_date, occ.start_time]
  ds = [s[0].year, s[0].month, s[0].day] if s[0]
  ts = [s[1].hour, s[1].minute, s[1].second] if s[1]
@@ -77,18 +76,4 @@ class Treat::Workers::Extractors::Time::Nickel

  end

- # Keeps the lowest-level time annotations that do
- # not conflict with a higher time annotation.
- # Returns true if the entity conflicts with a
- # higher-level time annotation.
- def self.remove_time_from_ancestors(entity, time)
- entity.ancestors_with_type(:phrase).each do |a|
- next if !a.has?(:time)
- return false unless a.get(:time).to_s == time.to_s
- a.unset(:time, :time_recurrence,
- :time_recurrence_interval, :end_time)
- end
- true
- end
-
  end
data/lib/treat/workers/extractors/time/ruby.rb CHANGED
@@ -2,7 +2,7 @@
  # DateTime.parse() method.
  class Treat::Workers::Extractors::Time::Ruby

- # Require Ruby's date module.
+
  require 'date'

  # Return a DateTime object representing the date/time
@@ -13,42 +13,11 @@ class Treat::Workers::Extractors::Time::Ruby
  #
  # Options: none.
  def self.time(entity, options = {})
- s = entity.to_s
- return if s =~ /^[0-9]+$/
  begin
- time = ::DateTime.parse(s)
- if entity.has_parent? &&
- remove_time_from_ancestors(entity, time)
- nil
- else
- time
- end
+ DateTime.parse(entity.to_s)
  rescue
  nil
  end
  end

-
- # Keeps the lowest-level time annotations that do
- # not conflict with a higher time annotation.
- # Returns true if the entity conflicts with a
- # higher-level time annotation.
- def self.remove_time_from_ancestors(entity, time)
-
- entity.ancestors_with_type(:phrase).each do |a|
-
- next if !a.has?(:time)
-
- unless a.get(:time) == time
- return true
- end
-
- a.unset(:time)
-
- end
-
- false
-
- end
-
  end
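The slimmed-down Ruby extractor is now a thin wrapper around DateTime.parse that swallows parse failures:

  Treat::Workers::Extractors::Time::Ruby.time('2012-06-12 14:30')
  # => DateTime for June 12, 2012, 14:30
  Treat::Workers::Extractors::Time::Ruby.time('not a date')
  # => nil (the ArgumentError from DateTime.parse is rescued)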
data/lib/treat/workers/lexicalizers/taggers/stanford.rb CHANGED
@@ -1,10 +1,10 @@
- # POS tagging using (i) explicit use of both preceding
- # and following tag contexts via a dependency network
- # representation, (ii) broad use of lexical features,
- # including jointly conditioning on multiple consecutive
- # words, (iii) effective use of priors in conditional
- # loglinear models, and (iv) fine-grained modeling of
- # unknown word features.
+ # POS tagging using a maximum entropy model, with (i)
+ # explicit use of both preceding and following tag
+ # contexts via a dependency network representation,
+ # (ii) broad use of lexical features, including jointly
+ # conditioning on multiple consecutive words, (iii)
+ # effective use of priors in conditional loglinear models,
+ # and (iv) fine-grained modeling of unknown word features.
  #
  # Original paper: Toutanova, Manning, Klein and Singer.
  # 2003. Feature-Rich Part-of-Speech Tagging with a
@@ -21,9 +21,6 @@ class Treat::Workers::Lexicalizers::Taggers::Stanford
  :tagger_model => nil
  }

- # Shortcut for gem config.
- Config = StanfordCoreNLP::Config
-
  # Tag the word using one of the Stanford taggers.
  def self.tag(entity, options = {})

@@ -64,6 +61,10 @@ class Treat::Workers::Lexicalizers::Taggers::Stanford
  def self.init_tagger(language)
  unless @@taggers[language]
  Treat::Loaders::Stanford.load(language)
+ unless StanfordCoreNLP.const_defined?('MaxentTagger')
+ StanfordCoreNLP.load_class('MaxentTagger',
+ 'edu.stanford.nlp.tagger.maxent')
+ end
  model = Treat::Loaders::Stanford.find_model(:pos,language)
  tagger = StanfordCoreNLP::MaxentTagger.new(model)
  @@taggers[language] = tagger
data/lib/treat/workers/processors/parsers/stanford.rb CHANGED
@@ -1,140 +1,88 @@
- # Parsing using an interface to a Java implementation
- # of probabilistic natural language parsers, both
- # optimized PCFG and lexicalized dependency parsers,
- # and a lexicalized PCFG parser.
- #
- # Original paper: Dan Klein and Christopher D.
- # Manning. 2003. Accurate Unlexicalized Parsing.
- # Proceedings of the 41st Meeting of the Association
+ # Parsing using an interface to a Java implementation
+ # of probabilistic natural language parsers, both
+ # optimized PCFG and lexicalized dependency parsers,
+ # and a lexicalized PCFG parser.
+ #
+ # Original paper: Dan Klein and Christopher D.
+ # Manning. 2003. Accurate Unlexicalized Parsing.
+ # Proceedings of the 41st Meeting of the Association
  # for Computational Linguistics, pp. 423-430.
  class Treat::Workers::Processors::Parsers::Stanford
-
+
  Pttc = Treat.tags.aligned.phrase_tags_to_category
-
+
  # Hold one instance of the pipeline per language.
  @@parsers = {}

- DefaultOptions = {
- :parser_model => nil,
- :tagger_model => nil
- }
+ DefaultOptions = { model: nil }

  # Parse the entity using the Stanford parser.
- #
- # Options:
- #
- # - (Boolean) :silent => whether to silence the output
- # of the JVM.
- # - (String) :log_file => a filename to log output to
- # instead of displaying it.
  def self.parse(entity, options = {})

- val, lang = entity.to_s, entity.language
- init(lang, options) unless @@parsers[lang]
-
- entity.check_hasnt_children
+ val, lang = entity.to_s, entity.language.intern
+
+ Treat::Loaders::Stanford.load(lang)

  tag_set = StanfordCoreNLP::Config::TagSets[lang]

- text = ::StanfordCoreNLP::Annotation.new(val)
- @@parsers[lang].annotate(text)
-
- text.get(:sentences).each do |s|
-
- if entity.is_a?(Treat::Entities::Sentence) ||
- entity.is_a?(Treat::Entities::Phrase)
- tag = s.get(:category).to_s
- tag_s, tag_opt = *tag.split('-')
- tag_s ||= 'S'
- entity.set :tag, tag_s
- entity.set :tag_opt, tag_opt if tag_opt
- recurse(s.get(:tree).children[0], entity, tag_set)
- break ####### ? FIX
- else
- recurse(s.get(:tree), entity, tag_set)
- end
-
+ list = get_token_list(entity)
+ entity.remove_all!
+
+ model_file = options[:model] ||
+ StanfordCoreNLP::Config::Models[:parse][lang]
+
+ unless @@parsers[lang] && @@parsers[lang][model_file]
+ model_path = Treat.libraries.stanford.model_path ||
+ StanfordCoreNLP.model_path
+ model_folder = StanfordCoreNLP::Config::ModelFolders[:parse]
+ model = File.join(model_path, model_folder, model_file)
+ @@parsers[lang] ||= {}
+ options = StanfordCoreNLP::Options.new
+ parser = StanfordCoreNLP::LexicalizedParser
+ .getParserFromFile(model, options)
+ @@parsers[lang][model_file] = parser
  end
-
- entity.set :tag_set, tag_set

- end
+ parser = @@parsers[lang][model_file]
+
+ text = parser.apply(list)
+
+ recurse(text.children[0], entity, tag_set)
+ entity.set :tag_set, tag_set

- def self.init(lang, options)
- Treat::Loaders::Stanford.load(lang)
- options = DefaultOptions.merge(options)
- StanfordCoreNLP.use(lang.intern)
- if options[:tagger_model]
- StanfordCoreNLP.set_model('pos.model', options[:tagger_model])
- end
- if options[:parser_model]
- StanfordCoreNLP.set_model('parser.model', options[:parser_model])
- end
- annotators = [:tokenize, :ssplit, :pos, :lemma, :parse]
- @@parsers[lang] = StanfordCoreNLP.load(*annotators)
  end

- # Helper method which recurses the tree supplied by
- # the Stanford parser.
- def self.recurse(java_node, ruby_node, tag_set, additional_tags = [])
-
- if java_node.num_children == 0
-
- label = java_node.label
- tag = label.get(:part_of_speech).to_s
- tag_s, tag_opt = *tag.split('-')
- tag_s ||= ''
- ruby_node.value = java_node.value.to_s.strip
- ruby_node.set :tag, tag_s
- ruby_node.set :tag_opt, tag_opt if tag_opt
- ruby_node.set :lemma, label.get(:lemma).to_s
-
- additional_tags.each do |t|
- lt = label.get(t)
- ruby_node.set t, lt.to_s if lt
- end
-
- ruby_node
-
- else
-
- if java_node.num_children == 1 &&
- java_node.children[0].num_children == 0
- recurse(java_node.children[0],
- ruby_node, tag_set, additional_tags)
- return
- end
+ def self.recurse(java_node, ruby_node, tag_set)
+
+ java_node.children.each do |java_child|

- java_node.children.each do |java_child|
-
- label = java_child.label
- tag = label.get(:category).to_s
- tag_s, tag_opt = *tag.split('-')
- tag_s ||= ''
-
- if Pttc[tag_s] && Pttc[tag_s][tag_set]
- ruby_child = Treat::Entities::Phrase.new
- else
- l = java_child.children[0].to_s
- v = java_child.children[0].value.to_s.strip
-
- # Mhmhmhmhmhm FIX!
- val = (l == v) ? v : l.split(' ')[-1].gsub(')', '')
- ruby_child = Treat::Entities::Token.from_string(val)
- end
+ label = java_child.label
+ tag = label.get(:category).to_s

- ruby_child.set :tag, tag_s
- ruby_child.set :tag_opt, tag_opt if tag_opt
+ if Pttc[tag] && Pttc[tag][tag_set]
+ ruby_child = Treat::Entities::Phrase.new
+ ruby_child.set :tag, tag
  ruby_node << ruby_child
-
  unless java_child.children.empty?
- recurse(java_child, ruby_child, tag_set, additional_tags)
+ recurse(java_child, ruby_child, tag_set)
  end
-
+ else
+ val = java_child.children[0].to_s
+ ruby_child = Treat::Entities::Token.from_string(val)
+ ruby_child.set :tag, tag
+ ruby_node << ruby_child
  end
-
+
  end

  end
-
+
+ def self.get_token_list(entity)
+ list = StanfordCoreNLP::ArrayList.new
+ entity.tokens.each do |token|
+ list.add(StanfordCoreNLP::Word.new(token.to_s))
+ end
+ list
+ end
+
  end
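The parser no longer runs the full CoreNLP pipeline: it builds a Stanford token list from the entity's existing tokens, applies a LexicalizedParser loaded directly from a model file, and recurses over the resulting tree. A hedged end-to-end sketch (the DSL builder and print_tree call are assumptions; only the worker invocation comes from the diff, and it expects the entity to be tokenized already):

  require 'treat'
  include Treat::Core::DSL                          # assumed DSL entry point

  s = sentence('The parser rebuilds the tree.')     # hypothetical builder
  s.tokenize                                        # worker reads entity.tokens
  Treat::Workers::Processors::Parsers::Stanford.parse(s)
  s.print_tree                                      # phrases and tokens now carry :tag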