treat 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (147) hide show
  1. data/INSTALL +0 -0
  2. data/LICENSE +28 -0
  3. data/README +0 -0
  4. data/TODO +67 -0
  5. data/bin/INFO +1 -0
  6. data/examples/benchmark.rb +81 -0
  7. data/examples/keywords.rb +60 -0
  8. data/examples/texts/bugged_out.txt +26 -0
  9. data/examples/texts/half_cocked_basel.txt +16 -0
  10. data/examples/texts/hedge_funds.txt +24 -0
  11. data/examples/texts/hose_and_dry.txt +19 -0
  12. data/examples/texts/hungarys_troubles.txt +46 -0
  13. data/examples/texts/indias_slowdown.txt +15 -0
  14. data/examples/texts/merkozy_rides_again.txt +24 -0
  15. data/examples/texts/prada_is_not_walmart.txt +9 -0
  16. data/examples/texts/republican_nomination.txt +26 -0
  17. data/examples/texts/to_infinity_and_beyond.txt +15 -0
  18. data/lib/treat.rb +91 -0
  19. data/lib/treat/buildable.rb +115 -0
  20. data/lib/treat/categories.rb +29 -0
  21. data/lib/treat/category.rb +28 -0
  22. data/lib/treat/delegatable.rb +90 -0
  23. data/lib/treat/detectors.rb +28 -0
  24. data/lib/treat/detectors/encoding/native.rb +12 -0
  25. data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
  26. data/lib/treat/detectors/format/file.rb +36 -0
  27. data/lib/treat/detectors/language/language_detector.rb +19 -0
  28. data/lib/treat/detectors/language/what_language.rb +29 -0
  29. data/lib/treat/entities.rb +52 -0
  30. data/lib/treat/entities/collection.rb +19 -0
  31. data/lib/treat/entities/constituents.rb +15 -0
  32. data/lib/treat/entities/document.rb +11 -0
  33. data/lib/treat/entities/entity.rb +242 -0
  34. data/lib/treat/entities/sentence.rb +8 -0
  35. data/lib/treat/entities/text.rb +7 -0
  36. data/lib/treat/entities/tokens.rb +37 -0
  37. data/lib/treat/entities/zones.rb +17 -0
  38. data/lib/treat/exception.rb +5 -0
  39. data/lib/treat/extractors.rb +41 -0
  40. data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
  41. data/lib/treat/extractors/named_entity/abner.rb +20 -0
  42. data/lib/treat/extractors/named_entity/stanford.rb +174 -0
  43. data/lib/treat/extractors/statistics/frequency.rb +22 -0
  44. data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
  45. data/lib/treat/extractors/statistics/position_in.rb +13 -0
  46. data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
  47. data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
  48. data/lib/treat/extractors/time/chronic.rb +12 -0
  49. data/lib/treat/extractors/time/native.rb +12 -0
  50. data/lib/treat/extractors/time/nickel.rb +45 -0
  51. data/lib/treat/extractors/topic_words/lda.rb +71 -0
  52. data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
  53. data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
  54. data/lib/treat/extractors/topics/reuters.rb +91 -0
  55. data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
  56. data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
  57. data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
  58. data/lib/treat/feature.rb +53 -0
  59. data/lib/treat/formatters.rb +44 -0
  60. data/lib/treat/formatters/cleaners/html.rb +17 -0
  61. data/lib/treat/formatters/readers/autoselect.rb +35 -0
  62. data/lib/treat/formatters/readers/gocr.rb +24 -0
  63. data/lib/treat/formatters/readers/html.rb +13 -0
  64. data/lib/treat/formatters/readers/ocropus.rb +31 -0
  65. data/lib/treat/formatters/readers/pdf.rb +17 -0
  66. data/lib/treat/formatters/readers/txt.rb +15 -0
  67. data/lib/treat/formatters/serializers/xml.rb +48 -0
  68. data/lib/treat/formatters/serializers/yaml.rb +15 -0
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
  70. data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
  71. data/lib/treat/formatters/unserializers/xml.rb +79 -0
  72. data/lib/treat/formatters/unserializers/yaml.rb +15 -0
  73. data/lib/treat/formatters/visualizers/dot.rb +73 -0
  74. data/lib/treat/formatters/visualizers/html.rb +12 -0
  75. data/lib/treat/formatters/visualizers/inspect.rb +16 -0
  76. data/lib/treat/formatters/visualizers/short_value.rb +14 -0
  77. data/lib/treat/formatters/visualizers/standoff.rb +41 -0
  78. data/lib/treat/formatters/visualizers/tree.rb +28 -0
  79. data/lib/treat/formatters/visualizers/txt.rb +31 -0
  80. data/lib/treat/group.rb +96 -0
  81. data/lib/treat/inflectors.rb +50 -0
  82. data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
  83. data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
  84. data/lib/treat/inflectors/declensors/en.rb +18 -0
  85. data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
  86. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
  87. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
  88. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
  89. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
  90. data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
  91. data/lib/treat/inflectors/stemmers/porter.rb +158 -0
  92. data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
  93. data/lib/treat/inflectors/stemmers/uea.rb +30 -0
  94. data/lib/treat/lexicalizers.rb +49 -0
  95. data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
  96. data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
  97. data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
  98. data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
  99. data/lib/treat/lexicalizers/tag/brill.rb +101 -0
  100. data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
  101. data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
  102. data/lib/treat/processors.rb +45 -0
  103. data/lib/treat/processors/chunkers/txt.rb +27 -0
  104. data/lib/treat/processors/parsers/enju.rb +214 -0
  105. data/lib/treat/processors/parsers/stanford.rb +60 -0
  106. data/lib/treat/processors/segmenters/punkt.rb +48 -0
  107. data/lib/treat/processors/segmenters/stanford.rb +45 -0
  108. data/lib/treat/processors/segmenters/tactful.rb +34 -0
  109. data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
  110. data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
  111. data/lib/treat/processors/tokenizers/perl.rb +96 -0
  112. data/lib/treat/processors/tokenizers/punkt.rb +42 -0
  113. data/lib/treat/processors/tokenizers/stanford.rb +33 -0
  114. data/lib/treat/processors/tokenizers/tactful.rb +59 -0
  115. data/lib/treat/proxies.rb +66 -0
  116. data/lib/treat/registrable.rb +26 -0
  117. data/lib/treat/resources.rb +10 -0
  118. data/lib/treat/resources/categories.rb +18 -0
  119. data/lib/treat/resources/delegates.rb +96 -0
  120. data/lib/treat/resources/dependencies.rb +0 -0
  121. data/lib/treat/resources/edges.rb +8 -0
  122. data/lib/treat/resources/formats.rb +23 -0
  123. data/lib/treat/resources/languages.rb +86 -0
  124. data/lib/treat/resources/languages.txt +504 -0
  125. data/lib/treat/resources/tags.rb +393 -0
  126. data/lib/treat/sugar.rb +43 -0
  127. data/lib/treat/tree.rb +174 -0
  128. data/lib/treat/utilities.rb +127 -0
  129. data/lib/treat/visitable.rb +27 -0
  130. data/test/profile.rb +2 -0
  131. data/test/tc_detectors.rb +27 -0
  132. data/test/tc_entity.rb +105 -0
  133. data/test/tc_extractors.rb +48 -0
  134. data/test/tc_formatters.rb +46 -0
  135. data/test/tc_inflectors.rb +39 -0
  136. data/test/tc_lexicalizers.rb +39 -0
  137. data/test/tc_processors.rb +36 -0
  138. data/test/tc_resources.rb +27 -0
  139. data/test/tc_treat.rb +64 -0
  140. data/test/tc_tree.rb +60 -0
  141. data/test/tests.rb +19 -0
  142. data/test/texts.rb +20 -0
  143. data/test/texts/english/long.html +24 -0
  144. data/test/texts/english/long.txt +22 -0
  145. data/test/texts/english/medium.txt +5 -0
  146. data/test/texts/english/short.txt +3 -0
  147. metadata +412 -0
module Treat
  module Entities
    # Represents a zone of text
    # (Title, Paragraph, List, Quote).
    # NOTE(review): the comment mentions Quote, but no Quote class is
    # defined in this file — confirm whether it lives elsewhere or is
    # planned.
    class Zone < Entity
    end
    # Represents a title, subtitle, logical header.
    class Title < Zone
    end
    # Represents a paragraph.
    class Paragraph < Zone
    end
    # Represents a list.
    class List < Zone
    end
  end
end
module Treat
  # Exception class for the Treat library.
  #
  # Inherits from StandardError rather than ::Exception: a bare
  # `rescue` only catches StandardError, and subclassing ::Exception
  # means rescuing Treat errors could also swallow SignalException,
  # SystemExit and NoMemoryError.
  class Exception < ::StandardError
  end
end
module Treat
  # Extractors extract specific information out of texts.
  module Extractors
    # Extracts a DateTime object containing a timestamp
    # from a string representation of a date/time.
    module Time
      extend Group
      self.type = :computer
      self.targets = [:word, :constituent, :symbol]
    end
    # Extracts the topics from a text.
    module Topics
      extend Group
      self.type = :annotator
      self.targets = [:collection, :document, :text, :zone, :sentence]
    end
    # Extracts the words that characterize each topic of a text.
    # (Comment fixed: it previously duplicated the Topics comment.)
    module TopicWords
      extend Group
      self.type = :annotator
      self.targets = [:collection, :document, :text, :zone, :sentence]
    end
    # Computes statistical measures (frequency, transition
    # matrices, etc.) over any entity.
    module Statistics
      extend Group
      self.type = :computer
      self.targets = [:entity]
      self.default = :none
    end
    # Extracts named entities (persons, locations, ...) from text.
    module NamedEntity
      extend Group
      self.type = :computer
      self.targets = [:entity]
    end
    # Selects the sentences that best summarize a text.
    module KeySentences
      extend Group
      self.type = :computer
      self.targets = [:collection, :document, :text, :zone, :sentence]
    end
    extend Treat::Category
  end
end
module Treat
  module Extractors
    module KeySentences
      # Scores each sentence by counting how many of its words appear
      # in a supplied set of topic words, and flags sentences whose
      # score reaches a threshold as key sentences.
      class TopicsFrequency

        # Minimum topic score for a sentence to be considered key.
        DEFAULT_THRESHOLD = 4

        # Last topic-word set supplied to key_sentences; kept (and
        # initialized) so score_sentence stays callable on its own
        # for backward compatibility.
        @@topics = {}

        # Returns the sentences of +entity+ whose topic score is at
        # least options[:threshold] (default: DEFAULT_THRESHOLD).
        # Each selected sentence is annotated with
        # :is_key_sentence? => true.
        #
        # Options:
        #   :topic_words - hash of topic id => array of word strings.
        #   :threshold   - minimum score to count as key.
        #
        # Raises Treat::Exception when +entity+ is smaller than a
        # sentence.
        def self.key_sentences(entity, options = {})
          options[:threshold] ||= DEFAULT_THRESHOLD
          @@topics = options[:topic_words]
          # Guard clause instead of the original if/else nesting.
          if Treat::Entities.rank(entity.type) <
            Treat::Entities.rank(:sentence)
            raise Treat::Exception, 'Cannot get the key ' +
            'sentences of an entity smaller than a sentence.'
          end
          sentence_scores = {}
          sentences = []
          entity.each_sentence do |sentence|
            sentence_scores[sentence.id] = score_sentence(sentence, @@topics)
          end
          sentence_scores.each do |sid, score|
            next if score < options[:threshold]
            s = entity.find(sid)
            s.set :is_key_sentence?, true
            sentences << s
          end
          sentences
        end

        # Scores one sentence: +1 for every word found in any topic's
        # word list; each word is also flagged with :is_keyword?.
        #
        # +topics+ defaults to the set stored by the last
        # key_sentences call, so existing single-argument callers
        # keep working; previously the method silently depended on
        # the @@topics class variable and raised NameError if called
        # before key_sentences.
        def self.score_sentence(sentence, topics = @@topics)
          sentence.set :topic_score, 0
          sentence.each_word do |word|
            found = false
            topics.each do |i, topic_words|
              if topic_words.include?(word.to_s)
                sentence.set :topic_score,
                (sentence.topic_score + 1)
                found = true
              end
            end
            word.set :is_keyword?, found
          end
          sentence.topic_score
        end

      end
    end
  end
end
module Treat
  module Extractors
    module NamedEntity
      # Named-entity extraction using the ABNER (A Biomedical Named
      # Entity Recognizer) Java library, accessed through the
      # Ruby-Java bridge (rjb).
      class Abner
        # Load the Ruby-Java bridge and import the ABNER tagger
        # class. The imported class MUST be captured in a constant:
        # the original code only printed the result of Rjb.import,
        # leaving AbnerTagger undefined so that named_entity raised
        # NameError on first use.
        silently do
          require 'rjb'
          Rjb::load('', ['-Xms256M', '-Xmx512M'])
          AbnerTagger = Rjb::import('tagger')
        end
        # Lazily-created singleton tagger instance.
        @@tagger = nil
        # Runs the ABNER tagger over the entity.
        # NOTE(review): the tagger's `tokenize` is called here —
        # confirm against the ABNER API that this is the intended
        # entry point for NER rather than a `tag`-style method.
        def self.named_entity(entity)
          @@tagger ||= AbnerTagger.new
          @@tagger.tokenize(entity)
        end
      end
    end
  end
end
module Treat
  module Extractors
    module NamedEntity
      # Named-entity recognition using the Stanford CoreNLP pipeline,
      # accessed through the Ruby-Java bridge (rjb).
      class Stanford
        # Load the Ruby-Java bridge, the CoreNLP jars, and import the
        # Java classes the annotator needs.
        silently do
          require 'rjb'
          Rjb::load(nil, ['-Xms256M', '-Xmx1024M'])
          Rjb::add_jar('/ruby/treat/bin/treat/treat.jar')
          Rjb::add_jar('/ruby/treat/bin/stanford/xom.jar')
          Rjb::add_jar('/ruby/treat/bin/stanford/joda-time.jar')
          Rjb::add_jar('/ruby/treat/bin/stanford/stanford-corenlp.jar')
          StanfordCoreNLP = Rjb::import('edu.stanford.nlp.pipeline.StanfordCoreNLP')
          Annotation = Rjb::import('edu.stanford.nlp.pipeline.Annotation')
          NamedEntityTagAnnotation = Rjb::import('edu.stanford.nlp.ling.CoreAnnotations$NamedEntityTagAnnotation')
          Properties = Rjb::import('java.util.Properties')
        end
        @@classifier = nil
        # Runs the CoreNLP NER pipeline over the entity's text and
        # returns the named-entity tag annotation as a string
        # (previously the method only printed debug output and
        # returned nil).
        def self.named_entity(entity, options = {})
          properties = Properties.new
          properties.set_property('annotators', 'tokenize, ssplit, pos, lemma, ner')
          properties.set_property('pos.model', '/ruby/treat/bin/stanford/taggers/english-left3words-distsim.tagger')
          properties.set_property('ner.model.3class', '/ruby/treat/bin/stanford/classifiers/all.3class.distsim.crf.ser.gz')
          properties.set_property('ner.model.7class', '/ruby/treat/bin/stanford/classifiers/muc.7class.distsim.crf.ser.gz')
          properties.set_property('ner.model.MISCclass', '/ruby/treat/bin/stanford/classifiers/conll.4class.distsim.crf.ser.gz')
          properties.set_property('parser.model', '/ruby/treat/bin/stanford_parser/grammar/englishPCFG.ser.gz')
          # The pipeline local must be declared OUTSIDE the
          # silence_stream block: a variable first assigned inside a
          # block is block-local, so the original code raised
          # NameError on the annotate call below.
          pipeline = nil
          silence_stream(STDOUT) do
            pipeline = StanfordCoreNLP.new(properties)
          end
          stanford_entity = Annotation.new(entity.to_s)
          pipeline.annotate(stanford_entity)
          stanford_entity.get_string(NamedEntityTagAnnotation)
        end
      end
    end
  end
end
39
+
40
+
41
+ =begin
42
+
43
+
44
+
45
+ CRFBiasedClassifier = Rjb::import('edu.stanford.nlp.ie.crf.CRFBiasedClassifier')
46
+ Properties = Rjb::import('java.util.Properties')
47
+ List = ::Rjb::import('java.util.ArrayList')
48
+ Word = ::Rjb::import('edu.stanford.nlp.ling.Word')
49
+ CoreAnnotations = ::Rjb::import('edu.stanford.nlp.ling.CoreAnnotations')
50
+ if @@classifier == nil
51
+ properties = Properties.new
52
+ options.each_pair do |option,value|
53
+ #properties.set_property('trainFile', )... Set the options.
54
+ end
55
+ @@classifier = CRFBiasedClassifier.new(properties)
56
+ @@classifier.load_classifier("/ruby/treat/bin/stanford_ner/classifiers/conll.4class.distsim.crf.ser.gz")
57
+ end
58
+ w = Word.new('Obama')
59
+ #puts @@classifier.java_methods
60
+ puts CoreAnnotations.public_methods.inspect
61
+ puts @@classifier.classify(w).get()
62
+
63
+
64
+ /*
65
+ * To change this template, choose Tools | Templates
66
+ * and open the template in the editor.
67
+ */
68
+
69
+ package corenlp;
70
+ import edu.stanford.nlp.ling.CoreAnnotations.CollapsedCCProcessedDependenciesAnnotation;
71
+ import edu.stanford.nlp.ling.CoreAnnotations.CorefGraphAnnotation;
72
+ import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
73
+ import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
74
+ import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
75
+ import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
76
+ import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
77
+ import edu.stanford.nlp.ling.CoreAnnotations.TreeAnnotation;
78
+ import edu.stanford.nlp.ling.CoreLabel;
79
+ import edu.stanford.nlp.pipeline.*;
80
+ import edu.stanford.nlp.trees.Tree;
81
+ import edu.stanford.nlp.trees.semgraph.SemanticGraph;
82
+ import edu.stanford.nlp.util.CoreMap;
83
+ import edu.stanford.nlp.util.IntTuple;
84
+ import edu.stanford.nlp.util.Pair;
85
+ import edu.stanford.nlp.util.Timing;
86
+ import java.io.File;
87
+ import java.io.FileInputStream;
88
+ import java.io.IOException;
89
+ import java.util.ArrayList;
90
+ import java.util.List;
91
+
92
+ import java.util.Properties;
93
+ /**
94
+ *
95
+ * @author Karthi
96
+ */
97
+ public class Main {
98
+
99
+ /**
100
+ * @param args the command line arguments
101
+ */
102
+ public static void main(String[] args) throws IOException, ClassNotFoundException {
103
+ // // TODO code application logic here
104
+ // System.out.println(System.getProperty("sun.arch.data.model"));
105
+ //// String str="-cp stanford-corenlp-2010-11-12.jar:stanford-corenlp-models-2010-11-06.jar:xom-1.2.6.jar:jgrapht-0.7.3.jar -Xms3g edu.stanford.nlp.pipeline.StanfordCoreNLP -file <input.txt>";
106
+ //// args=str.split(" ");
107
+ //// StanfordCoreNLP.main(args);
108
+ // Timing tim = new Timing();
109
+ // Properties props = null;
110
+ // props.setProperty("annotators", "ssplit, ner, parse, dcoref");
111
+ //
112
+ // StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
113
+ // props = pipeline.getProperties();
114
+ // long setupTime = tim.report();
115
+ // String fileName = "input.txt";
116
+ // ArrayList<File> files=null;
117
+ // files.add(new File(filename));
118
+ // pipeline.processFiles(pipeline, files, props);
119
+ //
120
+ //
121
+
122
+
123
+ // creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution
124
+ Properties props = new Properties();
125
+ FileInputStream in = new FileInputStream("Main.properties");
126
+
127
+ props.load(in);
128
+ in.close();
129
+ StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
130
+
131
+ // read some text in the text variable
132
+ String text = "The doctor can consult with other doctors about this patient. If that is the case, the name of the doctor and the names of the consultants have to be maintained. Otherwise, only the name of the doctor is kept. "; // Add your text here!
133
+
134
+ // create an empty Annotation just with the given text
135
+ Annotation document = new Annotation(text);
136
+
137
+ // run all Annotators on this text
138
+ pipeline.annotate(document);
139
+ System.out.println(document);
140
+
141
+ // these are all the sentences in this document
142
+ // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
143
+ List<CoreMap> sentences = (List<CoreMap>) document.get(SentencesAnnotation.class);
144
+ System.out.println(sentences);
145
+ for(CoreMap sentence: sentences) {
146
+ // traversing the words in the current sentence
147
+ // a CoreLabel is a CoreMap with additional token-specific methods
148
+ for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
149
+ // this is the text of the token
150
+ String word = token.get(TextAnnotation.class);
151
+ // this is the POS tag of the token
152
+ String pos = token.get(PartOfSpeechAnnotation.class);
153
+ // this is the NER label of the token
154
+ String ne = token.get(NamedEntityTagAnnotation.class);
155
+ }
156
+
157
+ // this is the parse tree of the current sentence
158
+ Tree tree = sentence.get(TreeAnnotation.class);
159
+ System.out.println(tree);
160
+ // this is the Stanford dependency graph of the current sentence
161
+ SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
162
+ System.out.println(dependencies);
163
+ }
164
+
165
+ // this is the coreference link graph
166
+ // each link stores an arc in the graph; the first element in the Pair is the source, the second is the target
167
+ // each node is stored as <sentence id, token id>. Both offsets start at 1!
168
+ List<Pair<IntTuple, IntTuple>> graph = document.get(CorefGraphAnnotation.class);
169
+ System.out.println(graph);
170
+
171
+ }
172
+
173
+ }
174
+ =end
module Treat
  module Extractors
    module Statistics
      # Computes how many times a token's (lowercased) value occurs
      # in the entity's token registry.
      class Frequency
        # Returns the number of registered occurrences of the
        # entity's value, or 0 when the value is not registered.
        # Raises Treat::Exception for non-terminal entities.
        def self.statistics(entity, options={})
          unless entity.is_leaf?
            raise Treat::Exception,
            'Cannot get the frequency of a non-terminal entity.'
          end
          occurrences = entity.token_registry[:value][entity.value.downcase]
          occurrences.nil? ? 0 : occurrences.size
        end
      end
    end
  end
end
module Treat
  module Extractors
    module Statistics
      # Computes how many times a given string value occurs in the
      # entity's token registry.
      class FrequencyOf
        # Returns the number of registered occurrences of
        # options[:value], or 0 when it is not registered.
        def self.statistics(entity, options = {})
          occurrences = entity.token_registry[:value][options[:value]]
          occurrences.nil? ? 0 : occurrences.size
        end
      end
    end
  end
end
module Treat
  module Extractors
    module Statistics
      # Placeholder: find the position of the current entity inside
      # the parent entity with a given type.
      class PositionIn
        # Not implemented yet; always raises Treat::Exception.
        #
        # An options hash is accepted (and ignored) so the signature
        # is consistent with the other Statistics extractors
        # (Frequency, FrequencyOf, TransitionMatrix), which all take
        # (entity, options = {}).
        def self.statistics(entity, options = {})
          raise Treat::Exception,
          'Statistics::PositionIn is not implemented yet.'
        end
      end
    end
  end
end
module Treat
  module Extractors
    module Statistics
      # Builds a transition matrix: for each entity of the requested
      # type(s), counts the feature values observed on its relatives
      # (parent, left, right, children) and on its edge-linked
      # entities, optionally normalizing counts to probabilities.
      class TransitionMatrix

        # Find the transition matrix.
        #
        # Options:
        #   :normalize     - normalize counts to probabilities
        #                    (default true).
        #   :features      - features to record (default [:tag]).
        #   :condition     - lambda filtering target entities.
        #   :entity_types  - types of entities to visit (or a single
        #                    :entity_type).
        #   :relationships - relations to follow
        #                    (default [:parent, :left, :right, :children]).
        def self.statistics(entity, options={})
          # BUGFIX: `options[:normalize] || true` always evaluated to
          # true, so an explicit `normalize: false` was ignored.
          normalize = options[:normalize].nil? ? true : options[:normalize]
          features = options[:features] || [:tag]
          condition = options[:condition] || lambda { |e| true }
          entity_types = options[:entity_types] ? options[:entity_types] :
          [options[:entity_type]]
          relationships = options[:relationships] ||
          [:parent, :left, :right, :children]

          # Lambdas producing fresh (deep-copied) count hashes.
          empty_prototype = {}; features.each { |f| empty_prototype[f] = {} }
          empty = lambda { Marshal.load(Marshal.dump(empty_prototype)) }
          empty2_prototype = {}; relationships.each { |r| empty2_prototype[r] = empty.call }
          empty2 = lambda { Marshal.load(Marshal.dump(empty2_prototype)) }

          # Deep (recursive) merger.
          merger = lambda do |key,v1,v2|
            Hash === v1 && Hash === v2 ? v1.merge(v2, &merger) : v2
          end

          # Master matrix.
          mm = nil

          entity.each_entity(*entity_types) do |target|

            next unless condition.call(target)

            # Initialize the empty transition matrix.
            tm = empty.call

            # Count feature transitions to each kind of relative.
            features.each do |f1|

              v1 = target.send(f1)
              tm[f1][v1] = empty2.call

              relationships.each do |relationship|
                tm[f1][v1][relationship] = empty.call
                # Relatives depend only on the relationship, so fetch
                # them once instead of once per feature.
                relatives = target.send(relationship)
                relatives = [relatives] unless relatives.is_a? Array

                features.each do |f2|
                  relatives.each do |relative|
                    # Single guard (the original nested a redundant
                    # `unless relative.nil?` around the same check).
                    next if relative.nil? || !relative.has?(f2)
                    v2 = relative.send(f2)
                    tm[f1][v1][relationship][f2][v2] ||= 0.0
                    tm[f1][v1][relationship][f2][v2] += 1.0
                  end
                end
              end

              # BUGFIX: edge counts are initialized once per (f1, v1).
              # Previously this reset ran inside the relationship and
              # feature loops, wiping counts accumulated on earlier
              # iterations, and the guard referenced `relative` — a
              # block-local of the relatives loop, out of scope there —
              # which raised NameError.
              tm[f1][v1][:edge] = empty.call
              features.each do |f2|
                target.edges.each do |id, edge_type|
                  s = target.ancestor_with_type :sentence
                  next unless s
                  x = s.find(id)
                  next unless x.has?(f2)
                  v2 = x.send(f2)
                  tm[f1][v1][:edge][f2][v2] ||= 0.0
                  tm[f1][v1][:edge][f2][v2] += 1.0
                end
              end

            end

            mm = mm ? mm.merge(tm, &merger) : tm
          end
          if normalize
            normalize(mm)
          else
            mm
          end
        end

        # Normalize the transition counts into probabilities: each
        # value-count hash is divided by its own total. Mutates and
        # returns the given matrix.
        def self.normalize(tm)
          tm.each do |f1, as|
            as.each do |a, dirs|
              dirs.each do |dir, f2s|
                f2s.each do |f2, vals|
                  sum = vals.values.inject(0) {|n,x| n+x }.to_f
                  vals.each do |val, count|
                    vals[val] = count/sum
                  end
                end
              end
            end
          end
          tm
        end

      end
    end
  end
end