DRMacIver-term-extractor 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,25 @@
+ Copyright (c) 2009, Trampoline Systems
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Trampoline Systems nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY Trampoline Systems ''AS IS'' AND ANY
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL <copyright holder> BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
data/README.markdown ADDED
@@ -0,0 +1,40 @@
+ # The Trampoline Systems term extractor
+
+ The term extractor is a library for taking natural text and extracting a
+ set of terms from it which make sense without additional context. For example, feeding it the following text from my home page:
+
+ Hi. I’m David.
+
+ I’m also various other things. By training I’m a mathematician,
+ but I seem to have drifted away from that and become a programmer,
+ currently working on natural language processing and social analytic
+ software at Trampoline Systems.
+
+ This site is my public face on the internet. It contains my blog,
+ my OpenID and anything else I want to share with the world.
+
+ We get the following terms:
+
+ David
+ training
+ mathematician
+ programmer
+ natural language processing
+ social analytic software
+ Trampoline Systems
+ site
+ public face
+ public face on the internet
+ internet
+ blog
+ world
+
+ No attempt is made to assign meaning to the terms: they're not guaranteed to represent the content of the document. They're just intended to be coherent snippets of text which you can reuse in a broader context.
+
+ One limitation of this is that it doesn't necessarily extract all reasonable terms. For example, "natural language" is a reasonable term for this text which is not included in the output. The way we use the term extractor at Trampoline is to build a vocabulary of terms we consider interesting and then perform literal string searches for those terms - this allows us to be selective in which terms we generate and permissive in looking for matches for them.
+
+ Currently only English is supported. There are plans to support other languages, but nothing is implemented in that regard: it requires someone who is a native speaker of that language, a competent programmer and at least passingly familiar with NLP, so understandably we're a bit resource-constrained on getting widespread non-English support.
+
+ ## Copyright
+
+ Copyright (c) 2009 Trampoline Systems. See LICENSE for details.
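A minimal usage sketch, assuming JRuby (the library drives OpenNLP and Snowball through the java bridge) and based on the API defined by the library code later in this diff; the sample text is made up:

```ruby
require "term-extractor"

extractor = TermExtractor.new # loads the bundled OpenNLP models

text = "By training I'm a mathematician, but I seem to have drifted away " \
       "from that and become a programmer."

extractor.extract_terms_from_text(text).each do |term|
  # Term#to_s is the term text, #pos its POS tag sequence, #sentence its sentence index.
  puts "#{term}\t#{term.pos}\tsentence #{term.sentence}"
end
```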
data/Rakefile ADDED
@@ -0,0 +1,56 @@
+ require 'rubygems'
+ require 'rake'
+ require 'spec/rake/spectask'
+
+ begin
+ require 'jeweler'
+ Jeweler::Tasks.new do |gem|
+ gem.name = "term-extractor"
+ gem.summary = %Q{A library for extracting useful terms from text}
+ gem.email = "david.maciver@gmail.com"
+ gem.homepage = "http://github.com/david.maciver@gmail.com/term-extractor"
+ gem.authors = ["David R. MacIver"]
+ end
+
+ rescue LoadError
+ puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+ end
+
+ begin
+ require 'rcov/rcovtask'
+ Rcov::RcovTask.new do |test|
+ test.libs << 'test'
+ test.pattern = 'test/**/*_test.rb'
+ test.verbose = true
+ end
+ rescue LoadError
+ task :rcov do
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
+ end
+ end
+
+
+ task :default => :test
+
+ require 'rake/rdoctask'
+ Rake::RDocTask.new do |rdoc|
+ if File.exist?('VERSION.yml')
+ config = YAML.load(File.read('VERSION.yml'))
+ version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
+ else
+ version = ""
+ end
+
+ rdoc.rdoc_dir = 'rdoc'
+ rdoc.title = "term-extractor #{version}"
+ rdoc.rdoc_files.include('README*')
+ rdoc.rdoc_files.include('lib/**/*.rb')
+ end
+
+ Spec::Rake::SpecTask.new do |t|
+ t.rcov = false
+ t.spec_files = FileList["test/**/*_spec.rb"]
+ t.libs << "./lib"
+ end
+
+
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.0.0
data/bin/terms.rb ADDED
@@ -0,0 +1,8 @@
+ #!/usr/bin/env jruby
+ require "term-extractor"
+
+ PE = TermExtractor.new
+
+ PE.nlp.each_sentence(ARGF) do |sentence|
+ puts PE.extract_terms_from_sentence(sentence)
+ end
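The script above streams whatever ARGF yields through the sentence splitter and prints one term per line. The same loop can be driven from an in-memory string instead of a file; a small sketch, with a made-up sample sentence:

```ruby
require "stringio"
require "term-extractor"

pe = TermExtractor.new
input = StringIO.new("I work on natural language processing at Trampoline Systems.")

# each_sentence splits the input into sentences; extract_terms_from_sentence
# returns the Term objects found in each one.
pe.nlp.each_sentence(input) do |sentence|
  puts pe.extract_terms_from_sentence(sentence)
end
```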
@@ -0,0 +1,195 @@
+ require "term-extractor/nlp"
+
+ class Term
+ attr_accessor :to_s, :pos, :sentence
+
+ def initialize(ts, pos, sentence = nil)
+ @to_s, @pos, @sentence = ts, pos, sentence
+ end
+ end
+
+ class TermExtractor
+ attr_accessor :nlp, :max_term_length, :proscribed_start, :required_ending, :remove_urls, :remove_paths
+
+ def initialize(models = File.dirname(__FILE__) + "/../models")
+ @nlp = NLP.new(models)
+
+ # Empirically, terms longer than about 5 words seem to be either
+ # too specific to be useful or very noisy.
+ @max_term_length = 5
+
+ # Common sources of crap starting words
+ @proscribed_start = /CC|PRP|IN|DT|PRP\$|WP|WP\$|TO|EX/
+
+ # We have to end in a noun, foreign word or number.
+ @required_ending = /NN|NNS|NNP|NNPS|FW|CD/
+
+ self.remove_urls = true
+ self.remove_paths = true
+
+ yield self if block_given?
+ end
+
+
+ class TermContext
+ attr_accessor :parent, :tokens, :postags, :chunks
+
+ def nlp
+ parent.nlp
+ end
+
+ def initialize(parent, sentence)
+ @parent = parent
+ sentence = NLP.clean_sentence(sentence)
+
+ # User definable cleaning.
+ sentence = NLP.remove_urls(sentence) if parent.remove_urls
+ sentence = NLP.remove_paths(sentence) if parent.remove_paths
+
+
+ @tokens = NLP.tokenize_sentence(sentence)
+ @postags = nlp.postagger.tag(tokens)
+ @chunks = nlp.chunker.chunk(tokens, postags)
+
+
+ @sentence = sentence
+
+ end
+
+ def boundaries
+ return @boundaries if @boundaries
+
+ # To each token we assign three attributes which determine how it may occur within a term.
+ # can_cross determines if this token can appear internally in a term
+ # can_start determines if a term is allowed to start with this token
+ # can_end determines if a term is allowed to end with this token
+ @boundaries = tokens.map{|t| {}}
+
+ @boundaries.each_with_index do |b, i|
+ tok = tokens[i]
+ pos = postags[i]
+ chunk = chunks[i]
+
+ # Cannot cross commas or coordinating conjunctions (and, or, etc)
+ b[:can_cross] = !(pos =~ /,|CC/)
+
+ # Cannot cross the beginning of verb terms
+ # i.e. we may start with verb terms but not include them
+ b[:can_cross] = (chunk != "B-VP") if b[:can_cross]
+
+ # We generate tags like <PATH>, <URL> and <QUOTE>
+ # to encapsulate various sorts of noise strings.
+ b[:can_cross] &&= !(tok =~ /<\w+>/)
+
+ # We are only allowed to start terms at the beginning of a term chunk
+ b[:can_start] = (chunks[i] == "B-NP")
+ if i > 0
+ if postags[i-1] =~ /DT|WDT|PRP|JJR|JJS/
+ # In some cases we want to move the start of a term to the right. These cases are:
+ # - a determiner (the, a, etc)
+ # - a possessive pronoun (my, your, etc)
+ # - comparative and superlative adjectives (best, better, etc.)
+ # In all cases we only do this for noun terms, and will only move them to internal points.
+ b[:can_start] ||= (chunks[i] == "I-NP")
+ @boundaries[i - 1][:can_start] = false
+ end
+ end
+
+ # We must include any tokens internal to the current chunk
+ b[:can_end] = !(chunks[i + 1] =~ /I-/)
+
+ # It is permitted to cross stopwords, but they cannot lie at the term boundary
+ if (nlp.stopword? tok) || (nlp.stopword? tokens[i..i+1].join) # Need to take into account contractions, which span multiple tokens
+ b[:can_end] = false
+ b[:can_start] = false
+ end
+
+ # The presence of a ' at the start of a token is most likely an indicator that we've
+ # split across a contraction. e.g. would've -> would 've. We are not allowed to
+ # cross this transition point.
+ if tok =~ /^'/
+ b[:can_start] = false
+ @boundaries[i - 1][:can_end] = false
+ end
+
+ # Must match the requirements for POSes at the beginning and end.
+ b[:can_start] &&= !(pos =~ parent.proscribed_start)
+ b[:can_end] &&= (pos =~ parent.required_ending)
+
+ end
+
+ @boundaries
+ end
+
+ def terms
+ return @terms if @terms
+
+ @terms = []
+
+ i = 0
+ j = 0
+ while(i < tokens.length)
+ if !boundaries[i][:can_start] || !boundaries[i][:can_cross]
+ i += 1
+ next
+ end
+
+ j = i if j < i
+
+ if (j == tokens.length) || !boundaries[j][:can_cross] || (j >= i + parent.max_term_length)
+ i += 1
+ j = i
+ next
+ end
+
+ if !boundaries[j][:can_end]
+ j += 1
+ next
+ end
+
+ term = tokens[i..j]
+ poses = postags.to_a[i..j]
+ term = Term.new(TermExtractor.recombobulate_term(term), poses.join("-"))
+ terms << term if TermExtractor.allowed_term?(term)
+
+ j += 1
+ end
+
+ @terms
+ end
+ end
+
+ # Extract all terms in a given sentence.
+ def extract_terms_from_sentence(sentence)
+ TermContext.new(self, sentence).terms
+ end
+
+ def extract_terms_from_text(text)
+ if block_given?
+ nlp.sentences(text).each_with_index do |s, i|
+ terms = extract_terms_from_sentence(s);
+ terms.each{|p| p.sentence = i; yield(p) }
+ end
+ else
+ results = []
+ extract_terms_from_text(text){ |p| results << p }
+ results
+ end
+ end
+
+ # Final post filter on terms to determine if they're allowed.
+ def self.allowed_term?(p)
+ return false if p.pos =~ /^CD(-CD)*$/ # We don't allow things which are just sequences of numbers
+ return false if p.to_s.length > 255
+ true
+ end
+
+ # Take a sequence of tokens and turn them back into a term.
+ def self.recombobulate_term(term)
+ term = term.join(" ")
+ term.gsub!(/ '/, "'")
+ term.gsub!(/ \./, ".")
+ term
+ end
+
+ end
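The window scan in TermContext#terms enumerates every span that starts at a token marked can_start, ends at a token marked can_end, extends only across can_cross tokens, and is at most max_term_length tokens long. A standalone re-illustration of that scan with hand-made boundary flags (the tokens and flags below are invented for the example; the real code derives them from POS tags and chunks, and additionally rejoins contractions and filters results through allowed_term?):

```ruby
tokens = ["the", "natural", "language", "processing", "team"]
flags = [
  { can_start: false, can_cross: true, can_end: false }, # "the": determiners may not start a term
  { can_start: true,  can_cross: true, can_end: false },
  { can_start: true,  can_cross: true, can_end: true  },
  { can_start: false, can_cross: true, can_end: true  },
  { can_start: false, can_cross: true, can_end: true  },
]
max_len = 5

terms = []
tokens.each_index do |i|
  next unless flags[i][:can_start]
  (i...[i + max_len, tokens.length].min).each do |j|
    break unless flags[j][:can_cross]                     # stop extending at an uncrossable token
    terms << tokens[i..j].join(" ") if flags[j][:can_end] # emit every window allowed to end here
  end
end

p terms
# => ["natural language", "natural language processing",
#     "natural language processing team", "language",
#     "language processing", "language processing team"]
```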
Binary file
@@ -0,0 +1,262 @@
+ require "fileutils"
+ require "java"
+ require "term-extractor/opennlp-tools"
+ require "term-extractor/maxent-2.5.2"
+ require "term-extractor/trove"
+ require "term-extractor/snowball"
+ require "set"
+
+ class TermExtractor
+ # NLP contains a lot of general NLP related utilities.
+ # In particular it contains:
+ # - a selection of OpenNLP classes
+ # - a snowball stemmer
+ # - a stopword list
+ #
+ # And various utilities built on top of these.
+ class NLP
+ JV = Java::OpennlpToolsLangEnglish
+ include_class("org.tartarus.snowball.ext.englishStemmer") { |x, y| "EnglishStemmer" }
+
+ def stem(word)
+ stemmer.setCurrent(word)
+ stemmer.stem
+ stemmer.getCurrent
+ end
+
+ def sentdetect
+ @sentdetect ||= JV::SentenceDetector.new(loc("sd.bin.gz"))
+ end
+
+ def tagdict
+ @tagdict ||= Java::OpennlpToolsPostag::POSDictionary.new(loc("tagdict"), true)
+ end
+
+ def postagger
+ @postagger ||= JV::PosTagger.new(loc("tag.bin.gz"), tagdict)
+ end
+
+ def chunker
+ @chunker ||= JV::TreebankChunker.new(loc("chunk.bin.gz"))
+ end
+
+ def stopwords
+ @stopwords
+ end
+
+ def stemmer
+ @stemmer ||= EnglishStemmer.new
+ end
+
+
+ def initialize(models)
+ @models = models
+ @stopwords = Set.new
+
+ File.open(loc("stopwords")).each_line do |l|
+ l.gsub!(/#.+$/, "")
+ @stopwords.add clean_for_stopword(l)
+ end
+ end
+
+ # Canonicalisation gives a string that in some sense captures the "essential character"
+ # of a piece of text. It normalizes it by removing unnecessary words, rearranging, and
+ # stripping suffixes.
+ # It is not itself intended to be a useful representation of the string, but instead for
+ # determining if two strings are equal.
+ def canonicalize(str)
+ str.
+ to_s.
+ downcase.
+ gsub(/[^\w\s]/, " ").
+ split.
+ select{|p| !stopword?(p)}.
+ map{|p| stem(p) }.
+ sort.
+ join(" ")
+ end
+
+ def stopword?(word)
+ stopwords.include?(clean_for_stopword(word))
+ end
+
+ # Once we have split sentences, we clean them up prior to tokenization. We remove or normalize
+ # a bunch of noise sources and get it to a form where distinct tokens are separated by whitespace.
+ def NLP.clean_sentence(text)
+ text = text.dup
+ text.gsub!(/--+/, " -- ") # TODO: What's this for?
+
+ # Normalize bracket types.
+ # TODO: Shouldn't do this inside of tokens.
+ text.gsub!(/{\[/, "(")
+ text.gsub!(/\}\]/, ")")
+
+ # We turn most forms of punctuation which are not internal to tokens into commas
+ punct = /(\"|\(|\)|;|-|\:|-|\*|,)/
+
+ # Convert cunning "smart" apostrophes into plain old boring
+ # dumb ones.
+ text.gsub!(/’/, "'")
+
+ text.gsub!(/([\w])\.\.+([\w])/){ "#{$1} , #{$2}"}
+ text.gsub!(/(^| )#{punct}+/, " , ")
+ text.gsub!(/#{punct}( |$)/, " , ")
+ text.gsub!(/(\.+ |')/){" #{$1}"}
+
+ separators = /\//
+
+ text.gsub!(/ #{separators} /, " , ")
+
+ # We can be a bit overeager in turning things into commas, so we clear them up here
+ # In particular we remove any we've accidentally added to the end of lines and we collapse
+ # consecutive ones into a single one.
+ text.gsub!(/(,|\.) *,/){ " #{$1} " }
+ text.gsub!(/(,| )+$/, "")
+ text.gsub!(/^(,| )+/, "")
+
+ text.gsub!(/((?:\.|\!|\?)+)$/){" #{$1}" }
+
+ # Clean up superfluous whitespace
+ text.gsub!(/\s+/, " ")
+ text
+ end
+
+ def NLP.tokenize_sentence(string)
+ clean_sentence(string).split
+ end
+
+ Ending = /(!|\?|\.)+/
+
+ def self.clean_text(text)
+ text = text.gsub(/\r(\n?)/, "\n") # Evil microsoft line endings, die die die!
+ text.gsub!(/^\s+$/, "") # For convenience, remove all spaces from blank lines
+ text.gsub!(/\n\n+/m, ".\n.\n") # Collapse multiple line endings into periods
+ text.gsub!(/\n/, " ") # Squash the text onto a single line.
+ text.gsub!(/(\d+)\. /){ "#{$1} . " } # We separate out things of the form 1. as these are commonly lists and OpenNLP sentence detection handles them badly
+ text.strip!
+ text
+ end
+
+ def self.remove_urls(text)
+ text.gsub(/\w+:\/\/[^\s]+?(?=\.?(?= |$))/, "<URL>")
+ end
+
+ def self.remove_paths(text)
+ text = text.clone
+
+ # Fragments of windows paths
+ text.gsub!(/[\w:\\]*\\[\w:\\]*/, "<PATH>")
+
+ # fragments of unix paths
+ text.gsub!(/\/[\w\/]+/, "<PATH>")
+ text.gsub!(/[\w\/]+\//, "<PATH>")
+
+ while text.gsub!(/<PATH>\s+\w+\s+<PATH>/, "<PATH>")
+ # concatenate fragments where we have e.g. <PATH> and <PATH>
+ # into single paths. This is to take into account paths containing spaces.
+ end
+
+ text.gsub!(/<PATH>(\s*<PATH)*/, "<PATH>")
+ text
+ end
+
+ EmbedBoundaries = [
+ ["\"", "\""],
+ ["(", ")"],
+ ["[", "]"],
+ ["{", "}"]
+ ].map{|s| s.map{|x| Regexp.quote(x) }}
+
+ # Normalise a sentence by removing all parenthetical comments and replacing all embedded quotes contained therein
+ # Return an array of the sentence and all contained subterms
+ def self.extract_embedded_sentences(text)
+ text = text.clone
+ fragments = [text]
+
+ l = nil
+ begin
+ l = fragments.length
+
+ EmbedBoundaries.each do |s, e|
+ replace = if s == e then "<QUOTE>" else "" end
+ matcher = /#{s}[^#{s}#{e}\n]*#{e}/
+ text.gsub!(matcher) { |frag| fragments << frag[1..-2]; replace }
+ end
+
+ end while fragments.length > l
+
+ if fragments.length > 1
+ fragments = fragments.map{|f| extract_embedded_sentences(f) }.flatten
+ end
+
+ fragments
+ end
+
+ def sentences(string)
+ sentdetect.sentDetect(NLP.clean_text(string)).to_a.map{|s| s.strip }.select{|s| (s.length > 0) && !(s =~ /^(\.|!|\?)+$/) }
+ end
+
+ def each_sentence(source)
+ lines = []
+
+ process_lines = lambda{
+ text = lines.join("\n").strip
+ if text != ""
+ sentences(text).each{|s| yield(s.gsub("\n", " ")) }
+ end
+ lines = []
+ }
+
+ source.each_line do |line|
+ line = line.strip
+
+ if line == ""
+ process_lines.call
+ end
+
+ lines << line
+ end
+
+ process_lines.call
+ end
+
+ def postag(tokens)
+ if tokens.is_a? String
+ tokens = NLP.tokenize_sentence(tokens)
+ else
+ tokens = tokens.to_a
+ end
+ tokens.zip(postagger.tag(tokens).to_a)
+ end
+
+ def chunk_text(text)
+ result = []
+ sentences(text).each{|x| result += chunk_sentence(x)}
+ result
+ end
+
+ def chunk_sentence(sentence)
+ tokens = NLP.tokenize_sentence(sentence)
+ postags = postagger.tag(tokens)
+ tokens.zip(chunker.chunk(tokens, postags).to_a)
+ end
+
+ private
+ def loc(file)
+ File.join(@models, file)
+ end
+
+ def clean_for_stopword(word)
+ word.downcase.gsub(/[^\w]/, "")
+ end
+
+ def chunk_type(tag)
+ case tag
+ when "O"
+ "O"
+ when /B-(.+)$/
+ $1
+ end
+ end
+ end
+ end
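NLP#canonicalize reduces a string to a key that ignores case, punctuation, stopwords, suffixes and word order, so two phrasings of the same thing compare equal. A pure-Ruby sketch of the idea, with a made-up stopword list and a crude suffix-stripping stand-in for the Snowball stemmer used by the real class:

```ruby
require "set"

STOPWORDS = %w[a an and at of on the].to_set # tiny stand-in list

def naive_stem(word)
  word.sub(/(ing|ers?|s)$/, "") # very rough approximation of Snowball stemming
end

def canonicalize(str)
  str.downcase.
      gsub(/[^\w\s]/, " ").                 # strip punctuation
      split.
      reject { |w| STOPWORDS.include?(w) }. # drop stopwords
      map { |w| naive_stem(w) }.            # strip suffixes
      sort.                                 # ignore word order
      join(" ")
end

canonicalize("The processing of terms") # => "process term"
canonicalize("term processing")         # => "process term"
```

Both strings collapse to the same key, which is the point: as the comment in the class says, the result is meant for equality checks rather than for display.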