cass 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/cass/contrast.rb ADDED
@@ -0,0 +1,55 @@
+ module Cass
+
+   # Defines a single contrast on a document or documents.
+   # Currently, only the comparison between two pairs of words (i.e., computation of
+   # the interaction term; see Holtzman et al., submitted, for details) is implemented.
+   class Contrast
+
+     attr_accessor :words
+
+     def initialize(words)
+       @words = words
+     end
+
+     # Initialize a contrast from a string representation.
+     def self.parse(contrast)
+       words = contrast.split(/,*\s+/)
+       words = [words[0,2], words[2,2]] if (words.size == 4)
+       Contrast.new(words)
+     end
+
+     # Apply the contrast to a document and return the result as a string.
+     # See the to_s method for the format.
+     def apply(doc)
+       sim = doc.similarity(@words.flatten)
+       if sim.class == Array
+         puts "Error: #{doc.name} is missing the following operands: #{sim.join(", ")}"
+         return false
+       end
+       # Compute the interaction term
+       if @words.flatten.size == 4
+         s = [doc.name, sim[0,2], sim[0,3], sim[1,2], sim[1,3]]
+         s[1,4] = s[1,4].map { |f| format("%0.4f", f).to_f }
+         s << s[1] - s[2] - s[3] + s[4]
+         s.join("\t")
+       else
+         [doc.name, sim[0,1]].join("\t")
+       end
+     end
+
+     # Returns a string representation of the contrast.
+     # For interaction terms, returns five columns (the four pairs
+     # and the interaction term).
+     # For pairwise contrasts, just returns the similarity.
+     def to_s
+       if @words.flatten.size == 4
+         o = @words.flatten
+         "document\t#{o[0]}.#{o[2]}\t#{o[0]}.#{o[3]}\t#{o[1]}.#{o[2]}\t#{o[1]}.#{o[3]}\tinteraction"
+       else
+         "document\t#{@words.join(".")}"
+       end
+     end
+
+   end
+
+ end
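
A quick usage sketch for this class (the words are hypothetical, and it assumes a Cass::Document `doc` whose similarity matrix has already been computed via Document#cooccurrence). A four-word contrast string parses into two word pairs, and apply emits the four pairwise similarities plus the interaction term:

    require 'cass'

    # "flowers insects, pleasant unpleasant" splits into two word pairs:
    # [["flowers", "insects"], ["pleasant", "unpleasant"]]
    contrast = Cass::Contrast.parse("flowers insects, pleasant unpleasant")
    puts contrast.to_s       # header row: document, the four pairs, interaction
    puts contrast.apply(doc) # doc is a Cass::Document with its similarity matrix ready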
data/lib/cass/document.rb ADDED
@@ -0,0 +1,232 @@
+ module Cass
+
+   # A Document object represents a single document--
+   # it can be either an entire file, or a subset of one.
+   class Document
+
+     attr_accessor :name, :targets, :text, :lines, :clines, :context, :tindex, :unique
+
+     # Create a new Document. Three arguments are required:
+     # * name: The name of the document (defaults to the filename)
+     # * targets: Either an array of target words, or an array of Contrasts from which targets will be extracted
+     # * text: A string of text (the contents of the document)
+     #
+     # The following (optional) settings can be passed in an options hash as the fourth argument:
+     # * context: A Context object to use (by default, a new one will be constructed)
+     # * skip_preproc: Skip most text preprocessing steps. This should only ever be used when creating a document derived from another document, where the text has already been processed.
+     # * max_lines: Maximum number of lines to use from the provided text. Note that this limit applies to the number of lines in the input text, NOT the number retained for analysis. By default, all lines will be used.
+     # * recode: A hash of words to recode in the text prior to analysis. For instance, if the key=>value pair 'liberal'=>'democrat' is passed, all occurrences of 'liberal' will be replaced with 'democrat'. This is useful when you want to analyze several words together as a single category, or for combining singular and plural forms of a word.
+     # * keep_case: By default, all words will be converted to lowercase. Passing this key in the options hash will preserve case in the text. Note that this will cause different cases of the same word to be handled as different words.
+     # * keep_special: By default, all non-alphabetical characters will be removed. Use this flag if you want special characters to be retained, with the same caveat as for keep_case.
+     # * parse_text: By default, it's assumed that the text is already broken up into sentences at the desired boundaries (one sentence per line). If the parse_text key is passed, a parser will be called. Note that USING THIS OPTION IS NOT RECOMMENDED. You should generally preprocess the input text yourself to ensure it looks right before submitting it to analysis.
+     # * parser_basic: If parse_text is on, the Parser will try to call the Stanford Parser by default. If the Stanford Parser isn't installed properly, a backup basic parser will be used. Including the parser_basic flag will skip the Stanford Parser entirely and go straight to the basic parser.
+     # * parser_regex: A custom regular expression that will be handed to the basic parser. Lines will be split at matches of the regex instead of the default (splitting only at newlines and periods). Note that parse_text and parser_basic must both be passed for this to work.
+     def initialize(name, targets, text, opts={})
+
+       # Error checking...
+       if name.nil?
+         abort("Error: document has no name!")
+       elsif targets.nil? or targets.class != Array or targets.empty?
+         abort("Error: invalid target specification; targets must be an array of words or Contrasts.")
+       elsif text.nil?
+         abort("Error: no text provided!")
+       end
+
+       # Set/initialize instance variables
+       @name, @text, @tindex = name, text, {}
+
+       # Get the list of target words from the contrasts if necessary
+       @targets =
+         if targets[0].class == Contrast
+           targets.inject([]) { |t, c| t + c.words.flatten }.uniq
+         else
+           targets
+         end
+
+       # Index targets, parse text, and create the Context
+       @targets.each_index { |i| @tindex[@targets[i]] = i }
+       parse(opts)
+       @context = opts['context'] || Context.new(self, opts)
+     end
+
+     # Parse raw text into sentences. When the skip_preproc option is passed
+     # (e.g., when bootstrapping or splitting a document),
+     # skip certain preprocessing steps, on the assumption that
+     # these have already been performed.
+     def parse(opts={})
+       if opts.key?('skip_preproc')
+         @lines = (@text.class == Array) ? @text : @text.split(/[\r\n]+/)
+       else
+         puts "Converting to lowercase..." if VERBOSE
+         @text.downcase! unless opts.key?('keep_case')
+         @text.gsub!(/[^a-z \n]+/, '') unless opts.key?('keep_special')
+         if opts.key?('recode')
+           puts "Recoding words..." if VERBOSE
+           opts['recode'].each { |k,v| @text.gsub!(/(^|\s+)(#{k})($|\s+)/, "\\1#{v}\\3") }
+         end
+         puts "Parsing text..." if VERBOSE
+         @lines = opts.key?('parse_text') ? Parser.parse(@text, opts) : @text.split(/[\r\n]+/)
+         @lines = @lines[0, opts['max_lines']] if opts.key?('max_lines')
+         trim!
+       end
+     end
+
+     # Trim the internal list of lines, keeping only those that contain
+     # at least one target word.
+     def trim!
+       puts "Deleting target-less lines..." if VERBOSE
+       ts = @targets.join("|")
+       # @lines.delete_if { |s| (s.split(/\s+/) & @targets).empty? } # another way to do it
+       nl = @lines.size
+       @lines = @lines.grep(/(^|\s+)(#{ts})($|\s+)/)
+       puts "Keeping #{@lines.size} / #{nl} lines." if VERBOSE
+       self
+     end
+
+     # Randomly reorder the lines of text.
+     # If clines is true, use the compacted lines variable; otherwise use all lines.
+     def permute(clines=false)
+       clines ? @clines.sort_by { rand } : @lines.sort_by { rand }
+     end
+
+     # Same as permute, but replaces the contents of the current document.
+     def permute!(clines=false)
+       instance_variable_set("@#{clines ? 'clines' : 'lines'}", permute(clines))
+       self
+     end
+
+     # Resample n lines WITH replacement from the text (for bootstrapping).
+     # n = number of lines to resample; defaults to the full size of the corpus.
+     # If clines is true, use the compacted lines variable; otherwise use all lines.
+     def resample(clines=false, n=nil)
+       n = @lines.size if n.nil? or n > @lines.size
+       max = @lines.size
+       Array.new(n).map { clines ? @clines[rand(max)] : @lines[rand(max)] }
+     end
+
+     # Same as resample, but replaces the contents of the current document.
+     def resample!(clines=false, n=nil)
+       instance_variable_set("@#{clines ? 'clines' : 'lines'}", resample(clines, n))
+       self
+     end
+
+     # Split the Document into n smaller random subsets,
+     # recalculating the context each time. Currently not used.
+     def split(n=10, recalc=true)
+       permute!
+       docs = []
+       n.times { |i|
+         text = @lines.slice!(0, (@lines.size.to_f / (n - i)).round)
+         context = recalc ? nil : @context
+         name = "#{@name}_split_#{(i+1)}"
+         docs[i] = Document.new(name, @targets, text, 'context' => context, 'skip_preproc' => true)
+       }
+       docs
+     end
+
+     # Bootstrap a distribution of n documents,
+     # recalculating the context each time. Note that this is currently
+     # not used anywhere because all the work is done in the Analysis class.
+     # def bootstrap(n=10, recalc=true)
+     #   permute!
+     #   docs = [self]
+     #   n.times { |i|
+     #     puts "Generating bootstrap ##{i+1}..." if VERBOSE
+     #     d = self.clone
+     #     d.name = "#{@name}_boot_#{(i+1)}"
+     #     d.resample!
+     #     d.context = Context.new(d) if recalc
+     #     d.text = nil # Don't need to save this again; save time by deleting...
+     #     docs << d
+     #   }
+     #   docs
+     # end
+
+     # Drop all words that aren't in the target list or context. Store as an array of arrays,
+     # with first element = array of targets and second = array of context words.
+     def compact
+       puts "Compacting all lines..." if VERBOSE
+       @clines = []
+       @lines.each { |l|
+         w = l.split(/\s+/).uniq
+         targs = w.select { |s| @tindex.key?(s) }
+         conts = w.delete_if { |s| !@context.key?(s) }
+         @clines << [targs, conts]
+       }
+     end
+
+     # Computes the co-occurrence matrix between the target words and the context.
+     # Stores a target-by-context matrix internally.
+     def cooccurrence(normalize_weights=false)
+       # puts "Generating co-occurrence matrix..." if VERBOSE
+       coocc = NMatrix.float(@targets.size, @context.size)
+       compact if @clines.nil?
+       @clines.each { |l|
+         targs, conts = l
+         targs.each { |t|
+           conts.each { |c|
+             next if t == c
+             incr = normalize_weights ? 1.0 / conts.size : 1
+             coocc[@tindex[t], @context[c]] = coocc[@tindex[t], @context[c]].to_f + incr
+           }
+         }
+       }
+       @cooc = coocc
+       @corr = @cooc.corr # .collect { |i| i*i } # Uncomment to square the coefficients.
+       self
+     end
+
+
+     # Return the requested subset of the similarity matrix.
+     # E.g., given the input ['apple', 'orange', 'elephant'],
+     # a 3 x 3 pairwise similarity matrix will be returned.
+     def similarity(words)
+       ind = @tindex.values_at(*words)
+       @corr[ind, ind]
+     end
+
+     # Computes the pairwise similarity between all possible target pairs
+     # and saves the results to the specified file. Note that this will
+     # produce an unmanageably large file if the number of
+     # targets is very large!
+     # The output file contains tab-delimited columns for:
+     # * Document name
+     # * First word in pair
+     # * Second word in pair
+     # * Similarity value (correlation)
+     def pairwise_similarity(filename)
+       abort("Error: you must compute the similarity matrix first!") if @corr.nil?
+       outf = File.new(filename, 'w')
+       outf.sync = true
+       ind = @tindex.invert # For looking up words by index
+       outf.puts %w[Document Word1 Word2 Correlation].join("\t")
+       (dim = @corr.shape[0]).times { |i|
+         i.upto(dim - 1) { |j| outf.puts [@name, ind[i], ind[j], @corr[i,j]].join("\t") }
+       }
+       outf.close
+     end
+
+     # Print out summary information about the document.
+     # Optional arguments:
+     # * filename: if not nil, the results will be saved to the location provided instead of printed.
+     # * list_context: if true, prints out the entire list (typically several thousand words) of words in the context.
+     # * word_count: prints the number of tokens in the document for each word, by descending frequency rank. Targets will be printed first. Note that the token counts reflect only the lines used in analysis (i.e., those that contain at least one target), and NOT the entire document. If you need word counts for an entire document, call Stats.word_count directly.
+     def summary(filename=nil, list_context=false, word_count=false)
+
+       buffer = []
+
+       # Basic info that always gets shown
+       buffer << "Summary for document '#{@name}':"
+       buffer << "#{@targets.size} target words (#{@targets.join(", ")})"
+       buffer << "#{@context.words.size} words in context."
+       buffer << "Using #{@clines.size} lines (containing at least one target word) for analysis."
+
+       # Other options
+       buffer << "\n\nList of all words in context:\n#{@context.words.join(", ")}" if list_context
+       buffer << "\n\nNumber of tokens for all words included in analysis:\n#{Stats.word_count(@clines)}" if word_count
+
+       filename.nil? ? puts(buffer) : File.open(filename, 'w') { |f| f.puts(buffer) }
+
+     end
+   end
+
+ end
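
A minimal end-to-end sketch of the Document pipeline (the corpus file and target words are hypothetical; it also assumes the gem has been loaded so that the VERBOSE constant referenced throughout the library is defined):

    require 'cass'

    # Hypothetical corpus: one preprocessed sentence per line.
    text = File.read('corpus.txt')
    targets = %w[flowers insects pleasant unpleasant]

    doc = Cass::Document.new('corpus', targets, text,
                             'max_lines' => 10_000,
                             'recode'    => { 'flower' => 'flowers' })
    doc.cooccurrence                        # build the target-by-context matrix
    p doc.similarity(%w[flowers pleasant])  # 2 x 2 similarity submatrix
    doc.summary                             # print basic document info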
data/lib/cass/extensions.rb ADDED
@@ -0,0 +1,16 @@
+ # Added functionality for the NArray matrix class.
+ class NMatrix
+
+   # Compute and return the column-wise correlation matrix for an NMatrix.
+   # Note: the matrix is standardized in place as a side effect.
+   def corr
+     n = self.shape[0]
+     n.times { |i|
+       col = self[i, true]
+       sd = col.stddev
+       self[i, true] = ((col.sbt!(col.mean)) / sd).to_a.flatten
+     }
+     (self.transpose * self) / (self.shape[1] - 1).to_f
+   end
+
+ end
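
A small sanity-check sketch for the corr extension (assumes the narray gem is installed and the extension is loaded via the cass gem; as noted in the comment above, corr standardizes the matrix in place):

    require 'cass'  # pulls in narray and this NMatrix extension

    # Two "target" columns observed over four lines (rows).
    m = NMatrix.float(2, 4)
    m[0, true] = [1.0, 2.0, 3.0, 4.0]
    m[1, true] = [2.0, 4.0, 6.0, 8.0]  # perfectly correlated with column 0
    p m.corr.to_a                      # perfectly correlated columns => entries of 1.0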
data/lib/cass/parser.rb ADDED
@@ -0,0 +1,44 @@
+ module Cass
+
+   # Parses a string (e.g., text read from a file) into sentences.
+   # Can use either the Stanford Natural Language Parser (if installed),
+   # or a barebones parser that splits text at line breaks and periods.
+   # Generally speaking, you shouldn't rely on the Parser class to
+   # parse and sanitize your input texts for you. This class implements
+   # only barebones functionality, and there's no guarantee the resulting
+   # text will look the way you want it to. You are strongly encouraged to
+   # process all texts yourself beforehand, and use this functionality
+   # only as a last resort.
+   class Parser
+
+     # Parses a string into sentences. If the Stanford Parser and
+     # associated Ruby gem are installed (http://stanfordparser.rubyforge.org/),
+     # they will be called to do the job. If not, only basic parsing
+     # will be performed: text will be split into sentences at newlines
+     # and periods. Note that this is suboptimal and may generate problems
+     # for some documents.
+     def self.parse(text, opts={})
+       # Try to load the Stanford Parser wrapper
+       spfail = false
+       begin
+         require 'stanfordparser'
+       rescue LoadError
+         puts "Error: the stanfordparser gem couldn't be loaded. Using barebones parsing mode instead. If you'd like to use" +
+           " the Stanford Parser, make sure all required components are installed (see http://stanfordparser.rubyforge.org/). You'll need to make sure the Java library is installed, as well as the treebank and jrb gems."
+         spfail = true
+       end
+
+       if spfail or opts.key?('parser_basic')
+         puts "Using a basic parser to split text into sentences. Note that this is intended as a last resort only; you are strongly encouraged to process all input texts yourself and make sure that lines are broken up the way you want them to be (with each sentence on its own line of the file). If you use this parser, we make no guarantees about the quality of the output."
+         rx = opts.key?('parser_regex') ? opts['parser_regex'] : "[\r\n.]+"
+         text.split(/#{rx}/)
+       else
+         puts "Using the Stanford Parser to parse the text. Note that this could take a long time for large files!" if VERBOSE
+         parser = StanfordParser::DocumentPreprocessor.new
+         parser.getSentencesFromString(text)
+       end
+     end
+
+   end
+
+ end
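
A short sketch of the fallback parsing path (the input string is hypothetical; the Stanford branch additionally requires the stanfordparser gem and its Java dependencies):

    require 'cass'

    text = "First sentence. Second sentence.\nThird on its own line."

    # Force the barebones parser; splits at newlines and periods by default.
    Cass::Parser.parse(text, 'parser_basic' => true)
    # => ["First sentence", " Second sentence", "Third on its own line"]

    # Or supply a custom boundary regex:
    Cass::Parser.parse(text, 'parser_basic' => true, 'parser_regex' => '[.!?\r\n]+')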
data/lib/cass/stats.rb ADDED
@@ -0,0 +1,40 @@
+ module Cass
+
+   # Collects miscellaneous descriptive statistic methods that
+   # may be useful. These are generally not hooked up to the
+   # primary processing stream, and need to be called on an
+   # ad-hoc basis.
+   class Stats
+
+     # Takes a string as input and prints out a list of all words encountered,
+     # sorted by their frequency count (in descending order).
+     # Words are separated by whitespace; no additional processing will be performed,
+     # so if you don't want special characters to define words, you need to preprocess
+     # the string before you call this method.
+     # Arguments:
+     # * text: the string to count token occurrences in.
+     # * stopwords: optional location of a stopword file. Words in the file will be excluded from the count.
+     # * save: the filename to save the results to. If left nil, will print to screen.
+     def self.word_count(text, stopwords=nil, save=nil)
+       sw = {}
+       text = text.join(" ") if text.class == Array
+       File.new(stopwords).readlines.each { |l| sw[l.strip] = 1 } if !stopwords.nil?
+       words = text.split(/\s+/)
+       counts = Hash.new(0)
+       words.each { |w| counts[w] += 1 if !sw.key?(w) }
+       counts = counts.sort { |a,b| b[1] <=> a[1] }.map { |l| "#{l[0]}: #{l[1]}" }
+       if save.nil?
+         puts counts
+       else
+         File.open(save, 'w') { |f| f.puts counts }
+       end
+     end
+
+     # Count the number of times a given token s occurs in text.
+     def self.string_tokens(text, s)
+       text.scan(/#{Regexp.escape(s)}/).size
+     end
+
+   end
+
+ end
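
A quick usage sketch for the Stats helpers (the input strings are hypothetical):

    require 'cass'

    text = "the cat sat on the mat the end"
    Cass::Stats.word_count(text)
    # Prints one word per line, by descending frequency:
    # the: 3
    # cat: 1
    # ...

    Cass::Stats.string_tokens(text, "the")  # => 3 (counts raw substring matches)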
metadata ADDED
@@ -0,0 +1,112 @@
+ --- !ruby/object:Gem::Specification
+ name: cass
+ version: !ruby/object:Gem::Version
+   hash: 29
+   prerelease: false
+   segments:
+   - 0
+   - 0
+   - 1
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Tal Yarkoni
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2010-06-15 00:00:00 -06:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: narray
+   prerelease: false
+   requirement: &id001 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         hash: 77
+         segments:
+         - 0
+         - 5
+         - 9
+         - 7
+         version: 0.5.9.7
+   type: :runtime
+   version_requirements: *id001
+ description: A set of tools for conducting Contrast Analyses of Semantic Similarity (CASS).
+ email: tyarkoni@gmail.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files:
+ - CHANGELOG
+ - LICENSE
+ - README.rdoc
+ - lib/cass.rb
+ - lib/cass/analysis.rb
+ - lib/cass/context.rb
+ - lib/cass/contrast.rb
+ - lib/cass/document.rb
+ - lib/cass/extensions.rb
+ - lib/cass/parser.rb
+ - lib/cass/stats.rb
+ files:
+ - CHANGELOG
+ - LICENSE
+ - Manifest
+ - README.rdoc
+ - Rakefile
+ - cass.gemspec
+ - lib/cass.rb
+ - lib/cass/analysis.rb
+ - lib/cass/context.rb
+ - lib/cass/contrast.rb
+ - lib/cass/document.rb
+ - lib/cass/extensions.rb
+ - lib/cass/parser.rb
+ - lib/cass/stats.rb
+ has_rdoc: true
+ homepage: http://casstools.org
+ licenses: []
+
+ post_install_message:
+ rdoc_options:
+ - --line-numbers
+ - --inline-source
+ - --title
+ - Cass
+ - --main
+ - README.rdoc
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       hash: 3
+       segments:
+       - 0
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       hash: 11
+       segments:
+       - 1
+       - 2
+       version: "1.2"
+ requirements: []
+
+ rubyforge_project: cass
+ rubygems_version: 1.3.7
+ signing_key:
+ specification_version: 3
+ summary: A set of tools for conducting Contrast Analyses of Semantic Similarity (CASS).
+ test_files: []
+