DRMacIver-term-extractor 0.0.0

data/LICENSE ADDED
@@ -0,0 +1,25 @@
+ Copyright (c) 2009, Trampoline Systems
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+     * Redistributions of source code must retain the above copyright
+       notice, this list of conditions and the following disclaimer.
+     * Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+     * Neither the name of Trampoline Systems nor the
+       names of its contributors may be used to endorse or promote products
+       derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY Trampoline Systems ''AS IS'' AND ANY
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL Trampoline Systems BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
data/README.markdown ADDED
@@ -0,0 +1,40 @@
+ # The Trampoline Systems term extractor
+
+ The term extractor is a library for taking natural text and extracting a
+ set of terms from it which make sense without additional context. For example, feeding it the following text from my home page:
+
+     Hi. I’m David.
+
+     I’m also various other things. By training I’m a mathematician,
+     but I seem to have drifted away from that and become a programmer,
+     currently working on natural language processing and social analytic
+     software at Trampoline Systems.
+
+     This site is my public face on the internet. It contains my blog,
+     my OpenID and anything else I want to share with the world.
+
+ We get the following terms:
+
+     David
+     training
+     mathematician
+     programmer
+     natural language processing
+     social analytic software
+     Trampoline Systems
+     site
+     public face
+     public face on the internet
+     internet
+     blog
+     world
+
+ No attempt is made to assign meaning to the terms: They're not guaranteed to represent the content of the document. They're just intended to be coherent snippets of text which you can reuse in a broader context.
+
+ One limitation of this is that it doesn't necessarily extract all reasonable terms. For example, "natural language" is a reasonable term for this text which is not included in the output. The way we use the term extractor at Trampoline is to build a vocabulary of terms we consider interesting and then perform literal string searches for those terms. This allows us to be selective in the terms we generate and permissive in looking for matches for them.
+
+ Currently only English is supported. There are plans to support other languages, but nothing is implemented in that regard: it requires someone who is a native speaker of the language, a competent programmer and at least passingly familiar with NLP, so understandably we're a bit resource constrained on getting widespread non-English support.
+
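+ ## Usage
+
+ As a rough sketch (see lib/term-extractor.rb for the full API; the input file name below is just a placeholder), extraction from JRuby looks something like this:
+
+     require "term-extractor"
+
+     extractor = TermExtractor.new
+     text = File.read("homepage.txt")
+
+     # Each result is a Term carrying its string form, its part of speech
+     # tags and the index of the sentence it was found in.
+     extractor.extract_terms_from_text(text).each do |term|
+       puts "#{term.sentence}: #{term}"
+     end
+
+ The library wraps the OpenNLP Java tools, so it needs to run under JRuby.
+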
+ ## Copyright
+
+ Copyright (c) 2009 Trampoline Systems. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,56 @@
+ require 'rubygems'
+ require 'rake'
+ require 'spec/rake/spectask'
+
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |gem|
+     gem.name = "term-extractor"
+     gem.summary = %Q{A library for extracting useful terms from text}
+     gem.email = "david.maciver@gmail.com"
+     gem.homepage = "http://github.com/DRMacIver/term-extractor"
+     gem.authors = ["David R. MacIver"]
+   end
+ rescue LoadError
+   puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+ end
+
+ begin
+   require 'rcov/rcovtask'
+   Rcov::RcovTask.new do |test|
+     test.libs << 'test'
+     test.pattern = 'test/**/*_test.rb'
+     test.verbose = true
+   end
+ rescue LoadError
+   task :rcov do
+     abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
+   end
+ end
+
+ # Run the specs by default (the spec task is defined below).
+ task :default => :spec
+
+ require 'rake/rdoctask'
+ Rake::RDocTask.new do |rdoc|
+   if File.exist?('VERSION.yml')
+     config = YAML.load(File.read('VERSION.yml'))
+     version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
+   else
+     version = ""
+   end
+
+   rdoc.rdoc_dir = 'rdoc'
+   rdoc.title = "term-extractor #{version}"
+   rdoc.rdoc_files.include('README*')
+   rdoc.rdoc_files.include('lib/**/*.rb')
+ end
+
+ Spec::Rake::SpecTask.new do |t|
+   t.rcov = false
+   t.spec_files = FileList["test/**/*_spec.rb"]
+   t.libs << "./lib"
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.0.0
data/bin/terms.rb ADDED
@@ -0,0 +1,8 @@
+ #!/usr/bin/env jruby
+ # Print the terms extracted from the files named on the command line
+ # (or from standard input if none are given), one term per line.
+ require "term-extractor"
+
+ PE = TermExtractor.new
+
+ PE.nlp.each_sentence(ARGF) do |sentence|
+   puts PE.extract_terms_from_sentence(sentence)
+ end
data/lib/term-extractor.rb ADDED
@@ -0,0 +1,195 @@
+ require "term-extractor/nlp"
+
+ class Term
+   attr_accessor :to_s, :pos, :sentence
+
+   def initialize(ts, pos, sentence = nil)
+     @to_s, @pos, @sentence = ts, pos, sentence
+   end
+ end
+
+ class TermExtractor
+   attr_accessor :nlp, :max_term_length, :proscribed_start, :required_ending, :remove_urls, :remove_paths
+
+   def initialize(models = File.dirname(__FILE__) + "/../models")
+     @nlp = NLP.new(models)
+
+     # Empirically, terms longer than about 5 words seem to be either
+     # too specific to be useful or very noisy.
+     @max_term_length = 5
+
+     # Common sources of crap starting words
+     @proscribed_start = /CC|PRP|IN|DT|PRP\$|WP|WP\$|TO|EX/
+
+     # We have to end in a noun, foreign word or number.
+     @required_ending = /NN|NNS|NNP|NNPS|FW|CD/
+
+     self.remove_urls = true
+     self.remove_paths = true
+
+     yield self if block_given?
+   end
+
+   class TermContext
+     attr_accessor :parent, :tokens, :postags, :chunks
+
+     def nlp
+       parent.nlp
+     end
+
+     def initialize(parent, sentence)
+       @parent = parent
+       sentence = NLP.clean_sentence(sentence)
+
+       # User definable cleaning.
+       sentence = NLP.remove_urls(sentence) if parent.remove_urls
+       sentence = NLP.remove_paths(sentence) if parent.remove_paths
+
+       @tokens = NLP.tokenize_sentence(sentence)
+       @postags = nlp.postagger.tag(tokens)
+       @chunks = nlp.chunker.chunk(tokens, postags)
+
+       @sentence = sentence
+     end
+
+     def boundaries
+       return @boundaries if @boundaries
+
+       # To each token we assign three attributes which determine how it may occur within a term.
+       # can_cross determines if this token can appear internally in a term
+       # can_start determines if a term is allowed to start with this token
+       # can_end determines if a term is allowed to end with this token
+       @boundaries = tokens.map{|t| {}}
+
+       @boundaries.each_with_index do |b, i|
+         tok = tokens[i]
+         pos = postags[i]
+         chunk = chunks[i]
+
+         # Cannot cross commas or coordinating conjunctions (and, or, etc)
+         b[:can_cross] = !(pos =~ /,|CC/)
+
+         # Cannot cross the beginning of verb phrases,
+         # i.e. we may start with verb phrases but not include them
+         b[:can_cross] = (chunk != "B-VP") if b[:can_cross]
+
+         # We generate tags like <PATH>, <URL> and <QUOTE>
+         # to encapsulate various sorts of noise strings.
+         b[:can_cross] &&= !(tok =~ /<\w+>/)
+
+         # We are only allowed to start terms at the beginning of a noun phrase chunk
+         b[:can_start] = (chunks[i] == "B-NP")
+         if i > 0
+           if postags[i-1] =~ /DT|WDT|PRP|JJR|JJS/
+             # In some cases we want to move the start of a term to the right. These cases are:
+             # - a determiner (the, a, etc)
+             # - a possessive pronoun (my, your, etc)
+             # - comparative and superlative adjectives (best, better, etc.)
+             # In all cases we only do this for noun phrases, and will only move them to internal points.
+             b[:can_start] ||= (chunks[i] == "I-NP")
+             @boundaries[i - 1][:can_start] = false
+           end
+         end
+
+         # We must include any tokens internal to the current chunk
+         b[:can_end] = !(chunks[i + 1] =~ /I-/)
+
+         # It is permitted to cross stopwords, but they cannot lie at the term boundary
+         if (nlp.stopword? tok) || (nlp.stopword? tokens[i..i+1].join) # Need to take into account contractions, which span multiple tokens
+           b[:can_end] = false
+           b[:can_start] = false
+         end
+
+         # The presence of a ' at the start of a token is most likely an indicator that we've
+         # split across a contraction, e.g. would've -> would 've. We are not allowed to
+         # cross this transition point.
+         if tok =~ /^'/
+           b[:can_start] = false
+           @boundaries[i - 1][:can_end] = false
+         end
+
+         # Must match the requirements for POSes at the beginning and end.
+         b[:can_start] &&= !(pos =~ parent.proscribed_start)
+         b[:can_end] &&= (pos =~ parent.required_ending)
+       end
+
+       @boundaries
+     end
+
+     def terms
+       return @terms if @terms
+
+       @terms = []
+
+       i = 0
+       j = 0
+       while i < tokens.length
+         if !boundaries[i][:can_start] || !boundaries[i][:can_cross]
+           i += 1
+           next
+         end
+
+         j = i if j < i
+
+         if (j == tokens.length) || !boundaries[j][:can_cross] || (j >= i + parent.max_term_length)
+           i += 1
+           j = i
+           next
+         end
+
+         if !boundaries[j][:can_end]
+           j += 1
+           next
+         end
+
+         term = tokens[i..j]
+         poses = postags.to_a[i..j]
+         term = Term.new(TermExtractor.recombobulate_term(term), poses.join("-"))
+         terms << term if TermExtractor.allowed_term?(term)
+
+         j += 1
+       end
+
+       @terms
+     end
+   end
+
+   # Extract all terms in a given sentence.
+   def extract_terms_from_sentence(sentence)
+     TermContext.new(self, sentence).terms
+   end
+
+   def extract_terms_from_text(text)
+     if block_given?
+       nlp.sentences(text).each_with_index do |s, i|
+         terms = extract_terms_from_sentence(s)
+         terms.each{|p| p.sentence = i; yield(p) }
+       end
+     else
+       results = []
+       extract_terms_from_text(text){ |p| results << p }
+       results
+     end
+   end
+
+   # Final post filter on terms to determine if they're allowed.
+   def self.allowed_term?(p)
+     return false if p.pos =~ /^CD(-CD)*$/ # We don't allow things which are just sequences of numbers
+     return false if p.to_s.length > 255
+     true
+   end
+
+   # Take a sequence of tokens and turn them back into a term.
+   def self.recombobulate_term(term)
+     term = term.join(" ")
+     term.gsub!(/ '/, "'")
+     term.gsub!(/ \./, ".")
+     term
+   end
+ end
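For orientation, here is a small, hypothetical driver for the class above (the input text is made up; the calls are the ones defined in this file). It shows the configuration block accepted by the constructor and the two calling styles of extract_terms_from_text:

    require "term-extractor"

    te = TermExtractor.new do |t|
      t.max_term_length = 4   # default is 5 words
      t.remove_urls = true    # already the default; shown for illustration
    end

    text = "By training I'm a mathematician, but these days I work on natural language processing at Trampoline Systems."

    # With a block, each Term is yielded as it is found, tagged with the
    # index of the sentence it came from.
    te.extract_terms_from_text(text) do |term|
      puts "#{term.sentence}\t#{term.pos}\t#{term}"
    end

    # Without a block, the same call collects the Terms into an array.
    terms = te.extract_terms_from_text(text)
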
data/lib/term-extractor/nlp.rb ADDED
@@ -0,0 +1,262 @@
+ require "fileutils"
+ require "java"
+ require "term-extractor/opennlp-tools"
+ require "term-extractor/maxent-2.5.2"
+ require "term-extractor/trove"
+ require "term-extractor/snowball"
+ require "set"
+
+ class TermExtractor
+   # NLP contains a lot of general NLP related utilities.
+   # In particular it contains:
+   # - a selection of OpenNLP classes
+   # - a snowball stemmer
+   # - a stopword list
+   #
+   # And various utilities built on top of these.
+   class NLP
+     JV = Java::OpennlpToolsLangEnglish
+     include_class("org.tartarus.snowball.ext.englishStemmer") { |x, y| "EnglishStemmer" }
+
+     def stem(word)
+       stemmer.setCurrent(word)
+       stemmer.stem
+       stemmer.getCurrent
+     end
+
+     def sentdetect
+       @sentdetect ||= JV::SentenceDetector.new(loc("sd.bin.gz"))
+     end
+
+     def tagdict
+       @tagdict ||= Java::OpennlpToolsPostag::POSDictionary.new(loc("tagdict"), true)
+     end
+
+     def postagger
+       @postagger ||= JV::PosTagger.new(loc("tag.bin.gz"), tagdict)
+     end
+
+     def chunker
+       @chunker ||= JV::TreebankChunker.new(loc("chunk.bin.gz"))
+     end
+
+     def stopwords
+       @stopwords
+     end
+
+     def stemmer
+       @stemmer ||= EnglishStemmer.new
+     end
+
+     def initialize(models)
+       @models = models
+       @stopwords = Set.new
+
+       File.open(loc("stopwords")).each_line do |l|
+         l.gsub!(/#.+$/, "")
+         @stopwords.add clean_for_stopword(l)
+       end
+     end
+
+     # Canonicalisation gives a string that in some sense captures the "essential character"
+     # of a piece of text. It normalizes it by removing unnecessary words, rearranging, and
+     # stripping suffixes.
+     # It is not itself intended to be a useful representation of the string, but instead for
+     # determining if two strings are equal.
+     def canonicalize(str)
+       str.
+         to_s.
+         downcase.
+         gsub(/[^\w\s]/, " ").
+         split.
+         select{|p| !stopword?(p)}.
+         map{|p| stem(p) }.
+         sort.
+         join(" ")
+     end
+
+     def stopword?(word)
+       stopwords.include?(clean_for_stopword(word))
+     end
+
+     # Once we have split sentences, we clean them up prior to tokenization. We remove or normalize
+     # a bunch of noise sources and get it to a form where distinct tokens are separated by whitespace.
+     def NLP.clean_sentence(text)
+       text = text.dup
+       text.gsub!(/--+/, " -- ") # TODO: What's this for?
+
+       # Normalize bracket types.
+       # TODO: Shouldn't do this inside of tokens.
+       text.gsub!(/{\[/, "(")
+       text.gsub!(/\}\]/, ")")
+
+       # We turn most forms of punctuation which are not internal to tokens into commas
+       punct = /(\"|\(|\)|;|-|\:|-|\*|,)/
+
+       # Convert cunning "smart" apostrophes into plain old boring
+       # dumb ones.
+       text.gsub!(/’/, "'")
+
+       text.gsub!(/([\w])\.\.+([\w])/){ "#{$1} , #{$2}"}
+       text.gsub!(/(^| )#{punct}+/, " , ")
+       text.gsub!(/#{punct}( |$)/, " , ")
+       text.gsub!(/(\.+ |')/){" #{$1}"}
+
+       separators = /\//
+
+       text.gsub!(/ #{separators} /, " , ")
+
+       # We can be a bit overeager in turning things into commas, so we clean them up here.
+       # In particular we remove any we've accidentally added to the end of lines and we collapse
+       # consecutive ones into a single one.
+       text.gsub!(/(,|\.) *,/){ " #{$1} " }
+       text.gsub!(/(,| )+$/, "")
+       text.gsub!(/^(,| )+/, "")
+
+       text.gsub!(/((?:\.|\!|\?)+)$/){" #{$1}" }
+
+       # Clean up superfluous whitespace
+       text.gsub!(/\s+/, " ")
+       text
+     end
+
+     def NLP.tokenize_sentence(string)
+       clean_sentence(string).split
+     end
+
+     Ending = /(!|\?|\.)+/
+
+     def self.clean_text(text)
+       text = text.gsub(/\r(\n?)/, "\n") # Evil Microsoft line endings, die die die!
+       text.gsub!(/^\s+$/, "") # For convenience, remove all spaces from blank lines
+       text.gsub!(/\n\n+/m, ".\n.\n") # Collapse multiple line endings into periods
+       text.gsub!(/\n/, " ") # Squash the text onto a single line.
+       text.gsub!(/(\d+)\. /){ "#{$1} . " } # We separate out things of the form "1." as these are commonly lists and OpenNLP sentence detection handles them badly
+       text.strip!
+       text
+     end
+
+     def self.remove_urls(text)
+       text.gsub(/\w+:\/\/[^\s]+?(?=\.?(?= |$))/, "<URL>")
+     end
+
+     def self.remove_paths(text)
+       text = text.clone
+
+       # Fragments of windows paths
+       text.gsub!(/[\w:\\]*\\[\w:\\]*/, "<PATH>")
+
+       # Fragments of unix paths
+       text.gsub!(/\/[\w\/]+/, "<PATH>")
+       text.gsub!(/[\w\/]+\//, "<PATH>")
+
+       while text.gsub!(/<PATH>\s+\w+\s+<PATH>/, "<PATH>")
+         # Concatenate fragments where we have e.g. <PATH> and <PATH>
+         # into single paths. This is to take into account paths containing spaces.
+       end
+
+       text.gsub!(/<PATH>(\s*<PATH)*/, "<PATH>")
+       text
+     end
+
+     EmbedBoundaries = [
+       ["\"", "\""],
+       ["(", ")"],
+       ["[", "]"],
+       ["{", "}"]
+     ].map{|s| s.map{|x| Regexp.quote(x) }}
+
+     # Normalise a sentence by removing all parenthetical comments and replacing all embedded quotes contained therein.
+     # Returns an array of the sentence and all contained subterms.
+     def self.extract_embedded_sentences(text)
+       text = text.clone
+       fragments = [text]
+
+       l = nil
+       begin
+         l = fragments.length
+
+         EmbedBoundaries.each do |s, e|
+           replace = if s == e then "<QUOTE>" else "" end
+           matcher = /#{s}[^#{s}#{e}\n]*#{e}/
+           text.gsub!(matcher) { |frag| fragments << frag[1..-2]; replace }
+         end
+       end while fragments.length > l
+
+       if fragments.length > 1
+         fragments = fragments.map{|f| extract_embedded_sentences(f) }.flatten
+       end
+
+       fragments
+     end
+
+     def sentences(string)
+       sentdetect.sentDetect(NLP.clean_text(string)).to_a.map{|s| s.strip }.select{|s| (s.length > 0) && !(s =~ /^(\.|!|\?)+$/) }
+     end
+
+     def each_sentence(source)
+       lines = []
+
+       process_lines = lambda{
+         text = lines.join("\n").strip
+         if text != ""
+           sentences(text).each{|s| yield(s.gsub("\n", " ")) }
+         end
+         lines = []
+       }
+
+       source.each_line do |line|
+         line = line.strip
+
+         if line == ""
+           process_lines.call
+         end
+
+         lines << line
+       end
+
+       process_lines.call
+     end
+
+     def postag(tokens)
+       if tokens.is_a? String
+         tokens = NLP.tokenize_sentence(tokens)
+       else
+         tokens = tokens.to_a
+       end
+       tokens.zip(postagger.tag(tokens).to_a)
+     end
+
+     def chunk_text(text)
+       result = []
+       sentences(text).each{|x| result += chunk_sentence(x)}
+       result
+     end
+
+     def chunk_sentence(sentence)
+       tokens = NLP.tokenize_sentence(sentence)
+       postags = postagger.tag(tokens)
+       tokens.zip(chunker.chunk(tokens, postags).to_a)
+     end
+
+     private
+
+     def loc(file)
+       File.join(@models, file)
+     end
+
+     def clean_for_stopword(word)
+       word.downcase.gsub(/[^\w]/, "")
+     end
+
+     def chunk_type(tag)
+       case tag
+       when "O"
+         "O"
+       when /B-(.+)$/
+         $1
+       end
+     end
+   end
+ end
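The NLP helper above is also usable on its own. The following sketch only calls methods defined in this file; exact results depend on the bundled OpenNLP models and stopword list, so none are shown as literal values:

    require "term-extractor"

    nlp = TermExtractor.new.nlp

    # Sentence splitting using the OpenNLP sentence detector.
    nlp.sentences("First sentence. Second sentence.")

    # Snowball stemming and stopword checks.
    nlp.stem("running")            # a stemmed form, e.g. "run"
    nlp.stopword?("the")           # true if "the" appears in models/stopwords

    # Canonicalisation lowercases, drops stopwords, stems and sorts the words,
    # so different surface forms of a phrase are likely to compare equal.
    nlp.canonicalize("the quick brown foxes") == nlp.canonicalize("quick brown fox")

    # Part of speech tagging and chunking return token/tag pairs.
    nlp.postag("David works at Trampoline Systems")
    nlp.chunk_sentence("David works at Trampoline Systems")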