DRMacIver-term-extractor 0.0.0
- data/LICENSE +25 -0
- data/README.markdown +40 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bin/terms.rb +8 -0
- data/lib/term-extractor.rb +195 -0
- data/lib/term-extractor/maxent-2.5.2.jar +0 -0
- data/lib/term-extractor/nlp.rb +262 -0
- data/lib/term-extractor/opennlp-tools.jar +0 -0
- data/lib/term-extractor/snowball.jar +0 -0
- data/lib/term-extractor/trove.jar +0 -0
- data/licenses/Maxent +421 -0
- data/licenses/OpenNLP +421 -0
- data/licenses/Trove +504 -0
- data/licenses/snowball.php +33 -0
- data/models/chunk.bin.gz +0 -0
- data/models/sd.bin.gz +0 -0
- data/models/stopwords +567 -0
- data/models/tag.bin.gz +0 -0
- data/models/tagdict +16204 -0
- data/models/tok.bin.gz +0 -0
- data/term-extractor.gemspec +66 -0
- data/test/examples_spec.rb +131 -0
- data/test/files/1.email +37 -0
- data/test/files/juries_seg_8_v1 +20 -0
- data/test/nlp_spec.rb +231 -0
- data/test/term_extractor_spec.rb +141 -0
- metadata +83 -0
data/LICENSE
ADDED
@@ -0,0 +1,25 @@
Copyright (c) 2009, Trampoline Systems
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Trampoline Systems nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY Trampoline Systems ''AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL Trampoline Systems BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.markdown
ADDED
@@ -0,0 +1,40 @@
# The Trampoline Systems term extractor

The term extractor is a library for taking natural text and extracting a
set of terms from it which make sense without additional context. For example, feeding it the following text from my home page:

    Hi. I’m David.

    I’m also various other things. By training I’m a mathematician,
    but I seem to have drifted away from that and become a programmer,
    currently working on natural language processing and social analytic
    software at Trampoline Systems.

    This site is my public face on the internet. It contains my blog,
    my OpenID and anything else I want to share with the world.

We get the following terms:

    David
    training
    mathematician
    programmer
    natural language processing
    social analytic software
    Trampoline Systems
    site
    public face
    public face on the internet
    internet
    blog
    world

No attempt is made to assign meaning to the terms: they're not guaranteed to represent the content of the document. They're just intended to be coherent snippets of text which you can reuse in a broader context.

One limitation of this is that it doesn't necessarily extract all reasonable terms. For example, "natural language" is a reasonable term for this text which is not included in the output. The way we use the term extractor at Trampoline is to build a vocabulary of terms we consider interesting and then perform literal string searches for those terms - this allows us to be selective in what terms we generate and permissive in looking for matches for them.

Currently only English is supported. There are plans to support other languages, but nothing is implemented in that regard: it requires someone who is a native speaker of the language, a competent programmer and at least passingly familiar with NLP, so understandably we're a bit resource constrained on getting widespread non-English support.

## Copyright

Copyright (c) 2009 Trampoline Systems. See LICENSE for details.
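The README doesn't show an invocation, so here is a minimal usage sketch. It is not part of the gem itself: it assumes JRuby (the library wraps the bundled OpenNLP/Maxent jars) and uses the API defined in lib/term-extractor.rb below; `other_document` is a hypothetical string introduced only for illustration.

    require "term-extractor"

    extractor = TermExtractor.new  # loads the bundled models directory by default

    text = "I'm currently working on natural language processing and " +
           "social analytic software at Trampoline Systems."

    # Each result is a Term whose to_s is the extracted snippet.
    extractor.extract_terms_from_text(text).each do |term|
      puts term
    end

    # The workflow the README describes: keep interesting terms as a
    # vocabulary, then do literal string searches for them elsewhere.
    vocabulary = extractor.extract_terms_from_text(text).map{|t| t.to_s }
    matches = vocabulary.select{|v| other_document.include?(v) }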
data/Rakefile
ADDED
@@ -0,0 +1,56 @@
require 'rubygems'
require 'rake'
require 'spec/rake/spectask'

begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "term-extractor"
    gem.summary = %Q{A library for extracting useful terms from text}
    gem.email = "david.maciver@gmail.com"
    gem.homepage = "http://github.com/DRMacIver/term-extractor"
    gem.authors = ["David R. MacIver"]
  end
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

task :default => :test

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  if File.exist?('VERSION.yml')
    config = YAML.load(File.read('VERSION.yml'))
    version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
  else
    version = ""
  end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "term-extractor #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

Spec::Rake::SpecTask.new do |t|
  t.rcov = false
  t.spec_files = FileList["test/**/*_spec.rb"]
  t.libs << "./lib"
end
data/VERSION
ADDED
@@ -0,0 +1 @@
0.0.0
data/lib/term-extractor.rb
ADDED
@@ -0,0 +1,195 @@
require "term-extractor/nlp"

class Term
  attr_accessor :to_s, :pos, :sentence

  def initialize(ts, pos, sentence = nil)
    @to_s, @pos, @sentence = ts, pos, sentence
  end
end

class TermExtractor
  attr_accessor :nlp, :max_term_length, :proscribed_start, :required_ending, :remove_urls, :remove_paths

  def initialize(models = File.dirname(__FILE__) + "/../models")
    @nlp = NLP.new(models)

    # Empirically, terms longer than about 5 words seem to be either
    # too specific to be useful or very noisy.
    @max_term_length = 5

    # Common sources of crap starting words
    @proscribed_start = /CC|PRP|IN|DT|PRP\$|WP|WP\$|TO|EX/

    # We have to end in a noun, foreign word or number.
    @required_ending = /NN|NNS|NNP|NNPS|FW|CD/

    self.remove_urls = true
    self.remove_paths = true

    yield self if block_given?
  end

  class TermContext
    attr_accessor :parent, :tokens, :postags, :chunks

    def nlp
      parent.nlp
    end

    def initialize(parent, sentence)
      @parent = parent
      sentence = NLP.clean_sentence(sentence)

      # User-definable cleaning.
      sentence = NLP.remove_urls(sentence) if parent.remove_urls
      sentence = NLP.remove_paths(sentence) if parent.remove_paths

      @tokens = NLP.tokenize_sentence(sentence)
      @postags = nlp.postagger.tag(tokens)
      @chunks = nlp.chunker.chunk(tokens, postags)

      @sentence = sentence
    end

    def boundaries
      return @boundaries if @boundaries

      # To each token we assign three attributes which determine how it may occur within a term.
      # can_cross determines if this token can appear internally in a term
      # can_start determines if a term is allowed to start with this token
      # can_end determines if a term is allowed to end with this token
      @boundaries = tokens.map{|t| {}}

      @boundaries.each_with_index do |b, i|
        tok = tokens[i]
        pos = postags[i]
        chunk = chunks[i]

        # Cannot cross commas or coordinating conjunctions (and, or, etc)
        b[:can_cross] = !(pos =~ /,|CC/)

        # Cannot cross the beginning of verb phrases,
        # i.e. we may start with verb phrases but not include them
        b[:can_cross] = (chunk != "B-VP") if b[:can_cross]

        # We generate tags like <PATH>, <URL> and <QUOTE>
        # to encapsulate various sorts of noise strings.
        b[:can_cross] &&= !(tok =~ /<\w+>/)

        # We are only allowed to start terms on the beginning of a noun phrase chunk
        b[:can_start] = (chunks[i] == "B-NP")
        if i > 0
          if postags[i-1] =~ /DT|WDT|PRP|JJR|JJS/
            # In some cases we want to move the start of a term to the right. These cases are:
            # - a determiner (the, a, etc)
            # - a possessive pronoun (my, your, etc)
            # - comparative and superlative adjectives (best, better, etc.)
            # In all cases we only do this for noun phrases, and will only move them to internal points.
            b[:can_start] ||= (chunks[i] == "I-NP")
            @boundaries[i - 1][:can_start] = false
          end
        end

        # We must include any tokens internal to the current chunk
        b[:can_end] = !(chunks[i + 1] =~ /I-/)

        # It is permitted to cross stopwords, but they cannot lie at the term boundary
        if (nlp.stopword? tok) || (nlp.stopword? tokens[i..i+1].join) # Need to take into account contractions, which span multiple tokens
          b[:can_end] = false
          b[:can_start] = false
        end

        # The presence of a ' at the start of a token is most likely an indicator that we've
        # split across a contraction. e.g. would've -> would 've. We are not allowed to
        # cross this transition point.
        if tok =~ /^'/
          b[:can_start] = false
          @boundaries[i - 1][:can_end] = false
        end

        # Must match the requirements for POSes at the beginning and end.
        b[:can_start] &&= !(pos =~ parent.proscribed_start)
        b[:can_end] &&= (pos =~ parent.required_ending)
      end

      @boundaries
    end

    def terms
      return @terms if @terms

      @terms = []

      i = 0
      j = 0
      while(i < tokens.length)
        if !boundaries[i][:can_start] || !boundaries[i][:can_cross]
          i += 1
          next
        end

        j = i if j < i

        if (j == tokens.length) || !boundaries[j][:can_cross] || (j >= i + parent.max_term_length)
          i += 1
          j = i
          next
        end

        if !boundaries[j][:can_end]
          j += 1
          next
        end

        term = tokens[i..j]
        poses = postags.to_a[i..j]
        term = Term.new(TermExtractor.recombobulate_term(term), poses.join("-"))
        @terms << term if TermExtractor.allowed_term?(term)

        j += 1
      end

      @terms
    end
  end

  # Extract all terms in a given sentence.
  def extract_terms_from_sentence(sentence)
    TermContext.new(self, sentence).terms
  end

  def extract_terms_from_text(text)
    if block_given?
      nlp.sentences(text).each_with_index do |s, i|
        terms = extract_terms_from_sentence(s)
        terms.each{|p| p.sentence = i; yield(p) }
      end
    else
      results = []
      extract_terms_from_text(text){ |p| results << p }
      results
    end
  end

  # Final post filter on terms to determine if they're allowed.
  def self.allowed_term?(p)
    return false if p.pos =~ /^CD(-CD)*$/ # We don't allow things which are just sequences of numbers
    return false if p.to_s.length > 255
    true
  end

  # Take a sequence of tokens and turn them back into a term.
  def self.recombobulate_term(term)
    term = term.join(" ")
    term.gsub!(/ '/, "'")
    term.gsub!(/ \./, ".")
    term
  end
end
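A quick hedged illustration of the two class-level helpers at the end of this file; the inputs are invented, but the outputs follow directly from the definitions above.

    # Tokens split on a contraction boundary are glued back together.
    TermExtractor.recombobulate_term(["David", "'s", "blog"])
    # => "David's blog"

    # Terms whose POS string is nothing but CD tags (pure number
    # sequences) fail the final post filter.
    t = Term.new("2009 11 23", "CD-CD-CD")
    TermExtractor.allowed_term?(t)   # => false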
data/lib/term-extractor/maxent-2.5.2.jar
ADDED
Binary file

data/lib/term-extractor/nlp.rb
ADDED
@@ -0,0 +1,262 @@
require "fileutils"
require "java"
require "term-extractor/opennlp-tools"
require "term-extractor/maxent-2.5.2"
require "term-extractor/trove"
require "term-extractor/snowball"
require "set"

class TermExtractor
  # NLP contains a lot of general NLP related utilities.
  # In particular it contains:
  # - a selection of OpenNLP classes
  # - a snowball stemmer
  # - a stopword list
  #
  # And various utilities built on top of these.
  class NLP
    JV = Java::OpennlpToolsLangEnglish
    include_class("org.tartarus.snowball.ext.englishStemmer") { |x, y| "EnglishStemmer" }

    def stem(word)
      stemmer.setCurrent(word)
      stemmer.stem
      stemmer.getCurrent
    end

    def sentdetect
      @sentdetect ||= JV::SentenceDetector.new(loc("sd.bin.gz"))
    end

    def tagdict
      @tagdict ||= Java::OpennlpToolsPostag::POSDictionary.new(loc("tagdict"), true)
    end

    def postagger
      @postagger ||= JV::PosTagger.new(loc("tag.bin.gz"), tagdict)
    end

    def chunker
      @chunker ||= JV::TreebankChunker.new(loc("chunk.bin.gz"))
    end

    def stopwords
      @stopwords
    end

    def stemmer
      @stemmer ||= EnglishStemmer.new
    end

    def initialize(models)
      @models = models
      @stopwords = Set.new

      File.open(loc("stopwords")).each_line do |l|
        l.gsub!(/#.+$/, "")
        @stopwords.add clean_for_stopword(l)
      end
    end

    # Canonicalisation gives a string that in some sense captures the "essential character"
    # of a piece of text. It normalizes it by removing unnecessary words, rearranging, and
    # stripping suffixes.
    # It is not itself intended to be a useful representation of the string, but instead for
    # determining if two strings are equal.
    def canonicalize(str)
      str.
        to_s.
        downcase.
        gsub(/[^\w\s]/, " ").
        split.
        select{|p| !stopword?(p)}.
        map{|p| stem(p) }.
        sort.
        join(" ")
    end

    def stopword?(word)
      stopwords.include?(clean_for_stopword(word))
    end

    # Once we have split sentences, we clean them up prior to tokenization. We remove or normalize
    # a bunch of noise sources and get it to a form where distinct tokens are separated by whitespace.
    def NLP.clean_sentence(text)
      text = text.dup
      text.gsub!(/--+/, " -- ") # TODO: What's this for?

      # Normalize bracket types.
      # TODO: Shouldn't do this inside of tokens.
      text.gsub!(/[{\[]/, "(")
      text.gsub!(/[}\]]/, ")")

      # We turn most forms of punctuation which are not internal to tokens into commas
      punct = /(\"|\(|\)|;|-|\:|\*|,)/

      # Convert cunning "smart" apostrophes into plain old boring
      # dumb ones.
      text.gsub!(/’/, "'")

      text.gsub!(/([\w])\.\.+([\w])/){ "#{$1} , #{$2}"}
      text.gsub!(/(^| )#{punct}+/, " , ")
      text.gsub!(/#{punct}( |$)/, " , ")
      text.gsub!(/(\.+ |')/){" #{$1}"}

      separators = /\//

      text.gsub!(/ #{separators} /, " , ")

      # We can be a bit overeager in turning things into commas, so we clean them up here.
      # In particular we remove any we've accidentally added to the end of lines and we collapse
      # consecutive ones into a single one.
      text.gsub!(/(,|\.) *,/){ " #{$1} " }
      text.gsub!(/(,| )+$/, "")
      text.gsub!(/^(,| )+/, "")

      text.gsub!(/((?:\.|\!|\?)+)$/){" #{$1}" }

      # Clean up superfluous whitespace
      text.gsub!(/\s+/, " ")
      text
    end

    def NLP.tokenize_sentence(string)
      clean_sentence(string).split
    end

    Ending = /(!|\?|\.)+/

    def self.clean_text(text)
      text = text.gsub(/\r(\n?)/, "\n") # Evil Microsoft line endings, die die die!
      text.gsub!(/^\s+$/, "") # For convenience, remove all spaces from blank lines
      text.gsub!(/\n\n+/m, ".\n.\n") # Collapse multiple line endings into periods
      text.gsub!(/\n/, " ") # Squash the text onto a single line.
      text.gsub!(/(\d+)\. /){ "#{$1} . " } # We separate out things of the form "1." as these are commonly lists, and OpenNLP sentence detection handles them badly
      text.strip!
      text
    end

    def self.remove_urls(text)
      text.gsub(/\w+:\/\/[^\s]+?(?=\.?(?= |$))/, "<URL>")
    end

    def self.remove_paths(text)
      text = text.clone

      # Fragments of windows paths
      text.gsub!(/[\w:\\]*\\[\w:\\]*/, "<PATH>")

      # Fragments of unix paths
      text.gsub!(/\/[\w\/]+/, "<PATH>")
      text.gsub!(/[\w\/]+\//, "<PATH>")

      while text.gsub!(/<PATH>\s+\w+\s+<PATH>/, "<PATH>")
        # Concatenate fragments where we have e.g. "<PATH> and <PATH>"
        # into single paths. This is to take into account paths containing spaces.
      end

      text.gsub!(/<PATH>(\s*<PATH>)*/, "<PATH>")
      text
    end

    EmbedBoundaries = [
      ["\"", "\""],
      ["(", ")"],
      ["[", "]"],
      ["{", "}"]
    ].map{|s| s.map{|x| Regexp.quote(x) }}

    # Normalise a sentence by removing all parenthetical comments and replacing all embedded quotes contained therein.
    # Return an array of the sentence and all contained subterms.
    def self.extract_embedded_sentences(text)
      text = text.clone
      fragments = [text]

      l = nil
      begin
        l = fragments.length

        EmbedBoundaries.each do |s, e|
          replace = if s == e then "<QUOTE>" else "" end
          matcher = /#{s}[^#{s}#{e}\n]*#{e}/
          text.gsub!(matcher) { |frag| fragments << frag[1..-2]; replace }
        end
      end while fragments.length > l

      if fragments.length > 1
        fragments = fragments.map{|f| extract_embedded_sentences(f) }.flatten
      end

      fragments
    end

    def sentences(string)
      sentdetect.sentDetect(NLP.clean_text(string)).to_a.map{|s| s.strip }.select{|s| (s.length > 0) && !(s =~ /^(\.|!|\?)+$/) }
    end

    def each_sentence(source)
      lines = []

      process_lines = lambda{
        text = lines.join("\n").strip
        if text != ""
          sentences(text).each{|s| yield(s.gsub("\n", " ")) }
        end
        lines = []
      }

      source.each_line do |line|
        line = line.strip

        if line == ""
          process_lines.call
        end

        lines << line
      end

      process_lines.call
    end

    def postag(tokens)
      if tokens.is_a? String
        tokens = NLP.tokenize_sentence(tokens)
      else
        tokens = tokens.to_a
      end
      tokens.zip(postagger.tag(tokens).to_a)
    end

    def chunk_text(text)
      result = []
      sentences(text).each{|x| result += chunk_sentence(x)}
      result
    end

    def chunk_sentence(sentence)
      tokens = NLP.tokenize_sentence(sentence)
      postags = postagger.tag(tokens)
      tokens.zip(chunker.chunk(tokens, postags).to_a)
    end

    private
    def loc(file)
      File.join(@models, file)
    end

    def clean_for_stopword(word)
      word.downcase.gsub(/[^\w]/, "")
    end

    def chunk_type(tag)
      case tag
      when "O"
        "O"
      when /B-(.+)$/
        $1
      end
    end
  end
end
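To round off, a hedged sketch of how a couple of the NLP helpers above behave. It is not part of the gem: the inputs are invented, "of" is assumed to be in the bundled stopword list, and exact stems depend on the bundled Snowball English stemmer.

    nlp = TermExtractor::NLP.new("data/models")

    # canonicalize drops stopwords, stems and sorts, so reordered
    # phrasings of the same idea compare equal.
    nlp.canonicalize("processing of natural language") ==
      nlp.canonicalize("natural language processing")   # => true

    # remove_urls replaces noise with a placeholder token, which the
    # term extractor then refuses to cross when building terms.
    TermExtractor::NLP.remove_urls("see http://example.com/foo for details")
    # => "see <URL> for details"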