treat 2.0.3 → 2.0.4

Files changed (36)
  1. data/lib/treat/config/data/languages/agnostic.rb +6 -3
  2. data/lib/treat/config/data/languages/english.rb +1 -1
  3. data/lib/treat/config/data/workers/extractors.rb +8 -0
  4. data/lib/treat/loaders/stanford.rb +2 -0
  5. data/lib/treat/version.rb +1 -1
  6. data/lib/treat/workers/extractors/distance/levenshtein.rb +35 -0
  7. data/lib/treat/workers/extractors/name_tag/stanford.rb +4 -1
  8. data/lib/treat/workers/extractors/similarity/jaro_winkler.rb +38 -0
  9. data/lib/treat/workers/extractors/similarity/tf_idf.rb +19 -3
  10. data/lib/treat/workers/extractors/time/chronic.rb +6 -41
  11. data/lib/treat/workers/extractors/time/kronic.rb +20 -0
  12. data/lib/treat/workers/extractors/time/nickel.rb +0 -15
  13. data/lib/treat/workers/extractors/time/ruby.rb +2 -33
  14. data/lib/treat/workers/lexicalizers/taggers/stanford.rb +11 -10
  15. data/lib/treat/workers/processors/parsers/stanford.rb +60 -112
  16. data/spec/entities/collection.rb +29 -25
  17. data/spec/entities/document.rb +45 -44
  18. data/spec/entities/entity.rb +295 -294
  19. data/spec/entities/phrase.rb +21 -17
  20. data/spec/entities/token.rb +43 -40
  21. data/spec/entities/word.rb +5 -1
  22. data/spec/entities/zone.rb +26 -22
  23. data/spec/helper.rb +7 -2
  24. data/spec/learning/data_set.rb +145 -141
  25. data/spec/learning/export.rb +46 -42
  26. data/spec/learning/problem.rb +114 -110
  27. data/spec/learning/question.rb +46 -42
  28. data/spec/treat.rb +41 -37
  29. data/spec/workers/agnostic.rb +2 -2
  30. data/spec/workers/english.rb +12 -12
  31. metadata +7 -8
  32. data/files/21552208.html +0 -786
  33. data/files/nethttp-cheat-sheet-2940.html +0 -393
  34. data/lib/treat/workers/extractors/similarity/levenshtein.rb +0 -36
  35. data/spec/sandbox.rb +0 -294
  36. data/spec/workers/examples/english/mathematicians/euler.html +0 -21
data/lib/treat/config/data/languages/agnostic.rb CHANGED
@@ -4,9 +4,10 @@
  'bson_ext', 'mongo', 'lda-ruby',
  'stanford-core-nlp', 'linguistics',
  'ruby-readability', 'whatlanguage',
- 'chronic', 'nickel', 'decisiontree',
+ 'chronic', 'kronic', 'nickel', 'decisiontree',
  'rb-libsvm', 'ruby-fann', 'zip', 'loggability',
- 'tf-idf-similarity', 'narray', 'fastimage'
+ 'tf-idf-similarity', 'narray', 'fastimage',
+ 'fuzzy-string-match', 'levenshtein-ffi'
  ],
  workers: {
  learners: {
@@ -16,7 +17,9 @@
  keywords: [:tf_idf],
  language: [:what_language],
  topic_words: [:lda],
- tf_idf: [:native]
+ tf_idf: [:native],
+ distance: [:levenshtein],
+ similarity: [:jaro_winkler, :tf_idf]
  },
  formatters: {
  serializers: [:xml, :yaml, :mongo],
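
The new optional dependencies back the additions elsewhere in this release: levenshtein-ffi powers the new :levenshtein distance worker, fuzzy-string-match the :jaro_winkler similarity worker, and kronic the extra time extractor. A minimal sketch of checking that they are installed before using the new workers; the require names are taken from the worker files added below and differ from the gem names for two of them:

# Gem names added to the optional dependency list above, and the
# files they are actually required as:
#   'kronic'             -> require 'kronic'
#   'fuzzy-string-match' -> require 'fuzzystringmatch'
#   'levenshtein-ffi'    -> require 'levenshtein'
%w[kronic fuzzystringmatch levenshtein].each do |lib|
  begin
    require lib
  rescue LoadError
    warn "Optional dependency '#{lib}' is not installed; " \
         "the corresponding Treat worker will be unavailable."
  end
end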
data/lib/treat/config/data/languages/english.rb CHANGED
@@ -14,7 +14,7 @@
  ],
  workers: {
  extractors: {
- time: [:chronic, :ruby, :nickel],
+ time: [:chronic, :kronic, :ruby, :nickel],
  topics: [:reuters],
  name_tag: [:stanford]
  },
data/lib/treat/config/data/workers/extractors.rb CHANGED
@@ -27,5 +27,13 @@
  tf_idf: {
  type: :annotator,
  targets: [:word]
+ },
+ similarity: {
+ type: :computer,
+ targets: [:entity]
+ },
+ distance: {
+ type: :computer,
+ targets: [:entity]
  }
  }
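
Since both new groups are registered as computers targeting :entity, Treat's usual convention of turning a worker group into an entity method suggests usage along the following lines. This is a sketch of the expected call pattern rather than output from the gem's own specs; the DSL builder is assumed from Treat 2.0 and the :to option comes from the new worker files below.

require 'treat'
include Treat::Core::DSL   # Treat 2.0 DSL (assumed available)

w = word('dinner')

# Edit distance via the new :levenshtein worker (levenshtein-ffi gem).
w.distance(:levenshtein, to: 'diner')       # => 1

# String similarity via the new :jaro_winkler worker (fuzzy-string-match gem).
w.similarity(:jaro_winkler, to: 'dinners')  # => Float between 0.0 and 1.0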
data/lib/treat/loaders/stanford.rb CHANGED
@@ -42,6 +42,8 @@ class Treat::Loaders::Stanford
  StanfordCoreNLP.log_file = '/dev/null'
  end
 
+ StanfordCoreNLP.bind
+
  @@loaded = true
 
  end
data/lib/treat/version.rb CHANGED
@@ -1,7 +1,7 @@
  module Treat
 
  # The current version of Treat.
- VERSION = "2.0.3"
+ VERSION = "2.0.4"
 
  # Treat requires Ruby >= 1.9.2
  if RUBY_VERSION < '1.9.2'
data/lib/treat/workers/extractors/distance/levenshtein.rb ADDED
@@ -0,0 +1,35 @@
+ # The C extension uses char* strings, and so Unicode strings
+ # will give incorrect distances. Need to provide a pure
+ # implementation if that's the case (FIX).
+ class Treat::Workers::Extractors::Distance::Levenshtein
+
+ require 'levenshtein'
+
+ DefaultOptions = {
+ ins_cost: 1,
+ del_cost: 1,
+ sub_cost: 1
+ }
+
+ @@matcher = nil
+
+ # Return the Levenshtein distance between
+ # two strings, taking into account the costs
+ # of insertion, deletion, and substitution.
+ def self.distance(entity, options)
+
+ options = DefaultOptions.merge(options)
+
+ unless options[:to]
+ raise Treat::Exception, "Must supply " +
+ "a string/entity to compare to using " +
+ "the option :to for this worker."
+ end
+
+ a, b = entity.to_s, options[:to].to_s
+
+ Levenshtein.distance(a, b)
+
+ end
+
+ end
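
The worker only needs its first argument to respond to #to_s, so it can also be exercised directly with plain strings. Note that the insertion/deletion/substitution costs declared in DefaultOptions are merged but not yet forwarded to Levenshtein.distance. A hypothetical direct call:

# Assumes treat (and the levenshtein-ffi gem) are already loaded.
levenshtein = Treat::Workers::Extractors::Distance::Levenshtein

levenshtein.distance('kitten', to: 'sitting')   # => 3
levenshtein.distance('kitten', {})              # raises Treat::Exception: no :to option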
data/lib/treat/workers/extractors/name_tag/stanford.rb CHANGED
@@ -1,7 +1,7 @@
  # Named entity tag extraction using the Stanford NLP
  # Deterministic Coreference Resolver, which implements a
  # multi-pass sieve coreference resolution (or anaphora
- # resolution) system.
+ # resolution) system based on conditional random fields.
  #
  # Original paper: Heeyoung Lee, Yves Peirsman, Angel
  # Chang, Nathanael Chambers, Mihai Surdeanu, Dan Jurafsky.
@@ -24,6 +24,9 @@ class Treat::Workers::Extractors::NameTag::Stanford
 
  unless classifier = @@classifiers[language]
  model = Treat::Loaders::Stanford.find_model(:ner, language)
+ unless StanfordCoreNLP.const_defined?('CRFClassifier')
+ StanfordCoreNLP.load_class('CRFClassifier', 'edu.stanford.nlp.ie.crf')
+ end
  classifier = StanfordCoreNLP::CRFClassifier.getClassifier(model)
  @@classifiers[language] = classifier
  end
data/lib/treat/workers/extractors/similarity/jaro_winkler.rb ADDED
@@ -0,0 +1,38 @@
+ # Similarity measure for short strings such as person names.
+ # C extension won't work for Unicode strings; need to set
+ # extension to "pure" in that case (FIX).
+ class Treat::Workers::Extractors::Similarity::JaroWinkler
+
+ require 'fuzzystringmatch'
+
+ DefaultOptions = {
+ threshold: 0.7,
+ implementation: nil
+ }
+
+ @@matcher = nil
+
+ def self.similarity(entity, options={})
+
+ options = DefaultOptions.merge(options)
+
+ unless options[:to]
+ raise Treat::Exception, "Must supply " +
+ "a string/entity to compare to using " +
+ "the option :to for this worker."
+ end
+
+ unless @@matcher
+ impl = options[:implementation]
+ impl ||= defined?(JRUBY_VERSION) ? :pure : :native
+ klass = FuzzyStringMatch::JaroWinkler
+ @@matcher = klass.create(impl)
+ end
+
+ a, b = entity.to_s, options[:to].to_s
+
+ @@matcher.getDistance(a, b)
+
+ end
+
+ end
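
On the first call the worker builds a single matcher, using fuzzy-string-match's native C implementation on MRI and the pure-Ruby one on JRuby unless :implementation is passed explicitly; later calls reuse that matcher. A hypothetical direct call with plain strings:

# Assumes treat (and the fuzzy-string-match gem) are already loaded.
jaro = Treat::Workers::Extractors::Similarity::JaroWinkler

# Classic Jaro-Winkler example pair; scores roughly 0.96.
jaro.similarity('martha', to: 'marhta')

# Identical strings score 1.0; completely dissimilar ones approach 0.0.
jaro.similarity('treat', to: 'treat')   # => 1.0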
data/lib/treat/workers/extractors/similarity/tf_idf.rb CHANGED
@@ -2,12 +2,28 @@
  class Treat::Workers::Extractors::Similarity::TfIdf
 
  require 'tf-idf-similarity'
+
+ def self.similarity(entity, options={})
 
- @collections = {}
+ raise 'Not currently implemented.'
+
+ unless options[:to] &&
+ options[:to].type == :document
+ raise Treat::Exception, 'Must supply ' +
+ 'a document to compare to using ' +
+ 'the option :to for this worker.'
+ end
 
- def self.tf_idf(collection, options={})
+ unless options[:to].parent_collection &&
+ entity.parent_collection
+ raise Treat::Exception, 'The TF*IDF ' +
+ 'similarity algorithm can only be applied ' +
+ 'to documents that are inside collections.'
+ end
+
  coll = TfIdfSimilarity::Collection.new
- collection.each_document do |doc|
+
+ entity.each_document do |doc|
  tdoc = TfIdfSimilarity::Document.new(doc.to_s)
  term_counts = Hash.new(0)
  doc.each_word do |word|
data/lib/treat/workers/extractors/time/chronic.rb CHANGED
@@ -2,52 +2,17 @@
  # Ruby natural language date parser.
  class Treat::Workers::Extractors::Time::Chronic
 
- # Require the 'chronic' gem.
- silence_warnings { require 'chronic' }
-
- # Require the Ruby DateTime module
+ require 'chronic'
  require 'date'
+
+ DefaultOptions = {guess: true}
 
  # Return the date information contained within
  # the entity by parsing it with the 'chronic' gem.
- #
- # Options: none.
  def self.time(entity, options = {})
-
- s = entity.to_s
- return if s =~ /^[0-9]+$/
- time = nil
-
- silence_warnings do
- time = ::Chronic.parse(s, {:guess => true})
- end
-
- if entity.has_parent? && remove_time_from_ancestors(entity, time)
- nil
- else
- time
- end
-
- end
-
- # Keeps the lowest-level time annotations that do
- # not conflict with a higher time annotation.
- # Returns true if the entity conflicts with a
- # higher-level time annotation.
- def self.remove_time_from_ancestors(entity, time)
-
- entity.ancestors_with_type(:phrase).each do |a|
-
- next if !a.has?(:time)
- unless a.get(:time) == time
- return true
- end
- a.unset(:time)
-
- end
-
- false
-
+ options = DefaultOptions.merge(options)
+ time = ::Chronic.parse(entity.to_s, options)
+ time ? DateTime.parse(time.to_s) : nil
  end
 
  end
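
The slimmed-down Chronic worker merges the caller's options into {guess: true}, hands everything to Chronic.parse, and converts the resulting Time to a DateTime (or returns nil when nothing parses). Any Chronic option can therefore be passed straight through; a hypothetical direct call:

# Assumes treat and the chronic gem are already loaded.
chronic = Treat::Workers::Extractors::Time::Chronic

chronic.time('tomorrow at 6pm')         # => a DateTime for tomorrow, 18:00
chronic.time('6 May', context: :past)   # => Chronic's :context option forwarded as-is
chronic.time('no date here')            # => nil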
data/lib/treat/workers/extractors/time/kronic.rb ADDED
@@ -0,0 +1,20 @@
+ # Time/date extraction using a simple rule-based library.
+ #
+ # Supported formats: Today, yesterday, tomorrow,
+ # last thursday, this thursday, 14 Sep, 14 June 2010.
+ # Any dates without a year are assumed to be in the past.
+ class Treat::Workers::Extractors::Time::Kronic
+
+ require 'kronic'
+ require 'date'
+
+ # Return the date information contained within
+ # the entity by parsing it with the 'kronic' gem.
+ #
+ # Options: none.
+ def self.time(entity, options = {})
+ time = Kronic.parse(entity.to_s)
+ time.is_a?(DateTime) ? time : nil
+ end
+
+ end
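
Kronic handles only the small set of human-friendly formats listed in the header comment, and the guard in the worker returns nil unless Kronic hands back a DateTime. A hypothetical direct call; results depend on the current date:

# Assumes treat and the kronic gem are already loaded.
kronic = Treat::Workers::Extractors::Time::Kronic

kronic.time('14 June 2010')   # => a DateTime for 2010-06-14, or nil if Kronic returns another type
kronic.time('last thursday')  # => the most recent Thursday, subject to the same guard
kronic.time('gibberish')      # => nil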
data/lib/treat/workers/extractors/time/nickel.rb CHANGED
@@ -47,7 +47,6 @@ class Treat::Workers::Extractors::Time::Nickel
  occ.interval : :none
  time_recurrence_interval = interval
 
-
  s = [occ.start_date, occ.start_time]
  ds = [s[0].year, s[0].month, s[0].day] if s[0]
  ts = [s[1].hour, s[1].minute, s[1].second] if s[1]
@@ -77,18 +76,4 @@ class Treat::Workers::Extractors::Time::Nickel
 
  end
 
- # Keeps the lowest-level time annotations that do
- # not conflict with a higher time annotation.
- # Returns true if the entity conflicts with a
- # higher-level time annotation.
- def self.remove_time_from_ancestors(entity, time)
- entity.ancestors_with_type(:phrase).each do |a|
- next if !a.has?(:time)
- return false unless a.get(:time).to_s == time.to_s
- a.unset(:time, :time_recurrence,
- :time_recurrence_interval, :end_time)
- end
- true
- end
-
  end
data/lib/treat/workers/extractors/time/ruby.rb CHANGED
@@ -2,7 +2,7 @@
  # DateTime.parse() method.
  class Treat::Workers::Extractors::Time::Ruby
 
- # Require Ruby's date module.
+
  require 'date'
 
  # Return a DateTime object representing the date/time
@@ -13,42 +13,11 @@ class Treat::Workers::Extractors::Time::Ruby
  #
  # Options: none.
  def self.time(entity, options = {})
- s = entity.to_s
- return if s =~ /^[0-9]+$/
  begin
- time = ::DateTime.parse(s)
- if entity.has_parent? &&
- remove_time_from_ancestors(entity, time)
- nil
- else
- time
- end
+ DateTime.parse(entity.to_s)
  rescue
  nil
  end
  end
 
-
- # Keeps the lowest-level time annotations that do
- # not conflict with a higher time annotation.
- # Returns true if the entity conflicts with a
- # higher-level time annotation.
- def self.remove_time_from_ancestors(entity, time)
-
- entity.ancestors_with_type(:phrase).each do |a|
-
- next if !a.has?(:time)
-
- unless a.get(:time) == time
- return true
- end
-
- a.unset(:time)
-
- end
-
- false
-
- end
-
  end
data/lib/treat/workers/lexicalizers/taggers/stanford.rb CHANGED
@@ -1,10 +1,10 @@
- # POS tagging using (i) explicit use of both preceding
- # and following tag contexts via a dependency network
- # representation, (ii) broad use of lexical features,
- # including jointly conditioning on multiple consecutive
- # words, (iii) effective use of priors in conditional
- # loglinear models, and (iv) fine-grained modeling of
- # unknown word features.
+ # POS tagging using a maximum entropy model, with (i)
+ # explicit use of both preceding and following tag
+ # contexts via a dependency network representation,
+ # (ii) broad use of lexical features, including jointly
+ # conditioning on multiple consecutive words, (iii)
+ # effective use of priors in conditional loglinear models,
+ # and (iv) fine-grained modeling of unknown word features.
  #
  # Original paper: Toutanova, Manning, Klein and Singer.
  # 2003. Feature-Rich Part-of-Speech Tagging with a
@@ -21,9 +21,6 @@ class Treat::Workers::Lexicalizers::Taggers::Stanford
  :tagger_model => nil
  }
 
- # Shortcut for gem config.
- Config = StanfordCoreNLP::Config
-
  # Tag the word using one of the Stanford taggers.
  def self.tag(entity, options = {})
 
@@ -64,6 +61,10 @@ class Treat::Workers::Lexicalizers::Taggers::Stanford
  def self.init_tagger(language)
  unless @@taggers[language]
  Treat::Loaders::Stanford.load(language)
+ unless StanfordCoreNLP.const_defined?('MaxentTagger')
+ StanfordCoreNLP.load_class('MaxentTagger',
+ 'edu.stanford.nlp.tagger.maxent')
+ end
  model = Treat::Loaders::Stanford.find_model(:pos,language)
  tagger = StanfordCoreNLP::MaxentTagger.new(model)
  @@taggers[language] = tagger
data/lib/treat/workers/processors/parsers/stanford.rb CHANGED
@@ -1,140 +1,88 @@
- # Parsing using an interface to a Java implementation
- # of probabilistic natural language parsers, both
- # optimized PCFG and lexicalized dependency parsers,
- # and a lexicalized PCFG parser.
- #
- # Original paper: Dan Klein and Christopher D.
- # Manning. 2003. Accurate Unlexicalized Parsing.
- # Proceedings of the 41st Meeting of the Association
+ # Parsing using an interface to a Java implementation
+ # of probabilistic natural language parsers, both
+ # optimized PCFG and lexicalized dependency parsers,
+ # and a lexicalized PCFG parser.
+ #
+ # Original paper: Dan Klein and Christopher D.
+ # Manning. 2003. Accurate Unlexicalized Parsing.
+ # Proceedings of the 41st Meeting of the Association
  # for Computational Linguistics, pp. 423-430.
  class Treat::Workers::Processors::Parsers::Stanford
-
+
  Pttc = Treat.tags.aligned.phrase_tags_to_category
-
+
  # Hold one instance of the pipeline per language.
  @@parsers = {}
 
- DefaultOptions = {
- :parser_model => nil,
- :tagger_model => nil
- }
+ DefaultOptions = { model: nil }
 
  # Parse the entity using the Stanford parser.
- #
- # Options:
- #
- # - (Boolean) :silent => whether to silence the output
- # of the JVM.
- # - (String) :log_file => a filename to log output to
- # instead of displaying it.
  def self.parse(entity, options = {})
 
- val, lang = entity.to_s, entity.language
- init(lang, options) unless @@parsers[lang]
-
- entity.check_hasnt_children
+ val, lang = entity.to_s, entity.language.intern
+
+ Treat::Loaders::Stanford.load(lang)
 
  tag_set = StanfordCoreNLP::Config::TagSets[lang]
 
- text = ::StanfordCoreNLP::Annotation.new(val)
- @@parsers[lang].annotate(text)
-
- text.get(:sentences).each do |s|
-
- if entity.is_a?(Treat::Entities::Sentence) ||
- entity.is_a?(Treat::Entities::Phrase)
- tag = s.get(:category).to_s
- tag_s, tag_opt = *tag.split('-')
- tag_s ||= 'S'
- entity.set :tag, tag_s
- entity.set :tag_opt, tag_opt if tag_opt
- recurse(s.get(:tree).children[0], entity, tag_set)
- break ####### ? FIX
- else
- recurse(s.get(:tree), entity, tag_set)
- end
-
+ list = get_token_list(entity)
+ entity.remove_all!
+
+ model_file = options[:model] ||
+ StanfordCoreNLP::Config::Models[:parse][lang]
+
+ unless @@parsers[lang] && @@parsers[lang][model_file]
+ model_path = Treat.libraries.stanford.model_path ||
+ StanfordCoreNLP.model_path
+ model_folder = StanfordCoreNLP::Config::ModelFolders[:parse]
+ model = File.join(model_path, model_folder, model_file)
+ @@parsers[lang] ||= {}
+ options = StanfordCoreNLP::Options.new
+ parser = StanfordCoreNLP::LexicalizedParser
+ .getParserFromFile(model, options)
+ @@parsers[lang][model_file] = parser
  end
-
- entity.set :tag_set, tag_set
 
- end
+ parser = @@parsers[lang][model_file]
+
+ text = parser.apply(list)
+
+ recurse(text.children[0], entity, tag_set)
+ entity.set :tag_set, tag_set
 
- def self.init(lang, options)
- Treat::Loaders::Stanford.load(lang)
- options = DefaultOptions.merge(options)
- StanfordCoreNLP.use(lang.intern)
- if options[:tagger_model]
- StanfordCoreNLP.set_model('pos.model', options[:tagger_model])
- end
- if options[:parser_model]
- StanfordCoreNLP.set_model('parser.model', options[:parser_model])
- end
- annotators = [:tokenize, :ssplit, :pos, :lemma, :parse]
- @@parsers[lang] = StanfordCoreNLP.load(*annotators)
  end
 
- # Helper method which recurses the tree supplied by
- # the Stanford parser.
- def self.recurse(java_node, ruby_node, tag_set, additional_tags = [])
-
- if java_node.num_children == 0
-
- label = java_node.label
- tag = label.get(:part_of_speech).to_s
- tag_s, tag_opt = *tag.split('-')
- tag_s ||= ''
- ruby_node.value = java_node.value.to_s.strip
- ruby_node.set :tag, tag_s
- ruby_node.set :tag_opt, tag_opt if tag_opt
- ruby_node.set :lemma, label.get(:lemma).to_s
-
- additional_tags.each do |t|
- lt = label.get(t)
- ruby_node.set t, lt.to_s if lt
- end
-
- ruby_node
-
- else
-
- if java_node.num_children == 1 &&
- java_node.children[0].num_children == 0
- recurse(java_node.children[0],
- ruby_node, tag_set, additional_tags)
- return
- end
+ def self.recurse(java_node, ruby_node, tag_set)
+
+ java_node.children.each do |java_child|
 
- java_node.children.each do |java_child|
-
- label = java_child.label
- tag = label.get(:category).to_s
- tag_s, tag_opt = *tag.split('-')
- tag_s ||= ''
-
- if Pttc[tag_s] && Pttc[tag_s][tag_set]
- ruby_child = Treat::Entities::Phrase.new
- else
- l = java_child.children[0].to_s
- v = java_child.children[0].value.to_s.strip
-
- # Mhmhmhmhmhm FIX!
- val = (l == v) ? v : l.split(' ')[-1].gsub(')', '')
- ruby_child = Treat::Entities::Token.from_string(val)
- end
+ label = java_child.label
+ tag = label.get(:category).to_s
 
- ruby_child.set :tag, tag_s
- ruby_child.set :tag_opt, tag_opt if tag_opt
+ if Pttc[tag] && Pttc[tag][tag_set]
+ ruby_child = Treat::Entities::Phrase.new
+ ruby_child.set :tag, tag
  ruby_node << ruby_child
-
  unless java_child.children.empty?
- recurse(java_child, ruby_child, tag_set, additional_tags)
+ recurse(java_child, ruby_child, tag_set)
  end
-
+ else
+ val = java_child.children[0].to_s
+ ruby_child = Treat::Entities::Token.from_string(val)
+ ruby_child.set :tag, tag
+ ruby_node << ruby_child
  end
-
+
  end
 
  end
-
+
+ def self.get_token_list(entity)
+ list = StanfordCoreNLP::ArrayList.new
+ entity.tokens.each do |token|
+ list.add(StanfordCoreNLP::Word.new(token.to_s))
+ end
+ list
+ end
+
  end
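
The reworked parser no longer drives the full CoreNLP pipeline: it collects the entity's existing tokens into a StanfordCoreNLP::ArrayList, clears the entity, and applies a LexicalizedParser cached per language and model file, so the entity should be tokenized before parsing. A sketch of the expected usage through Treat's DSL; the DSL builder, print_tree call, and model file name are illustrative assumptions, and the default model comes from StanfordCoreNLP::Config::Models:

require 'treat'
include Treat::Core::DSL   # Treat 2.0 DSL (assumed available)

s = sentence('The parser builds a syntax tree from pre-tokenized input.')
s.tokenize                 # the worker reads entity.tokens (see get_token_list)
s.parse(:stanford)         # or: s.parse(:stanford, model: 'englishPCFG.ser.gz')

s.print_tree               # inspect the resulting phrase/token structure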