gulp 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +8 -0
- data/CHANGELOG +1 -0
- data/LICENSE +20 -0
- data/README.markdown +25 -0
- data/Rakefile +59 -0
- data/TODO +3 -0
- data/VERSION +1 -0
- data/bin/gulp +94 -0
- data/lib/gulp.rb +23 -0
- data/lib/gulp/corpus.rb +35 -0
- data/lib/gulp/data_store.rb +43 -0
- data/lib/gulp/document.rb +71 -0
- data/lib/gulp/phrase.rb +34 -0
- data/lib/gulp/phrase_extractor.rb +49 -0
- data/test/phrase_extractor_test.rb +45 -0
- data/test/test_helper.rb +13 -0
- metadata +113 -0
data/.document
ADDED
data/.gitignore
ADDED
data/CHANGELOG
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
v0.0.1. Initial release
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010 Andrew Carpenter
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.markdown
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# Gulp
|
2
|
+
|
3
|
+
Ruby gem for identifying Statistically Improbable Phrases across a large document set.
|
4
|
+
|
5
|
+
This is pre-alpha; use at your own risk. API will change.
|
6
|
+
|
7
|
+
## Install
|
8
|
+
|
9
|
+
[sudo] gem install gulp
|
10
|
+
|
11
|
+
## Usage
|
12
|
+
|
13
|
+
gulp = Gulp.new(:database_directory => '/path/to/dir')
|
14
|
+
|
15
|
+
gulp.new_from_xml_file(path_1).process!.add_to_corpus!
|
16
|
+
gulp.new_from_xml_file(path_2).process!.add_to_corpus!
|
17
|
+
gulp.new_from_xml_file(path_3).process!.add_to_corpus!
|
18
|
+
|
19
|
+
doc = gulp.new_from_xml_file(path_4)
|
20
|
+
doc.process!
|
21
|
+
doc.phrases # => [<Gulp::Phrase>, <Gulp::Phrase>]
|
22
|
+
|
23
|
+
## Copyright
|
24
|
+
|
25
|
+
Copyright (c) 2010 Andrew Carpenter. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'rubygems'
require 'rake'

# Gem specification and packaging tasks, defined through Jeweler when it is
# installed. Without Jeweler the remaining tasks are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "gulp"
    gem.summary = %Q{Identify Statistically Improbable Phrases (SIPs)}
    gem.email = "andrew.main@gmail.com"
    gem.homepage = "http://github.com/andrewcarpenter/gulp"
    gem.authors = ["Andrew Carpenter"]
    gem.add_dependency 'activesupport'
    gem.add_dependency 'tokyocabinet'
    gem.add_dependency 'nokogiri'
    gem.add_dependency 'trollop'
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# `rake test` runs every test/**/*_test.rb file with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# `rake rcov` runs the same tests under RCov; when RCov is missing the task
# still exists but aborts with installation instructions.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end


task :default => :test

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # The project keeps its version in a plain-text VERSION file; VERSION.yml is
  # still honoured as a fallback so an older layout keeps working.
  version =
    if File.exist?('VERSION')
      File.read('VERSION').strip
    elsif File.exist?('VERSION.yml')
      require 'yaml'
      config = YAML.load(File.read('VERSION.yml'))
      "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
    else
      ""
    end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "gulp #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
59
|
+
|
data/TODO
ADDED
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/bin/gulp
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Command-line front end for Gulp.
#
# Sub-commands:
#   add          process one XML file (--file) or a newline-separated list of
#                paths read from STDIN, and add each to the corpus
#   sip          score the phrases of one XML file against the corpus
#   corpus_dump  print every phrase in the corpus with its document count

# TODO: remove this
$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__)) + '/../lib')

# TODO: remove this
I_KNOW_I_AM_USING_AN_OLD_AND_BUGGY_VERSION_OF_LIBXML2 = 1

require 'rubygems'
require 'trollop'
require 'gulp'

# Set by the INT handler and polled at safe points by okay_to_terminate!.
# It must be the instance variable (not a local) because both the trap block
# and the method below read @interrupted.
@interrupted = false
trap("INT") do
  puts "waiting so can exit cleanly..."
  @interrupted = true
end

# Exits the process if an interrupt was requested. Called between units of
# work so the process never stops in the middle of one.
def okay_to_terminate!
  if @interrupted
    puts "exited cleanly."
    exit
  end
end

SUB_COMMANDS = %w(add sip corpus_dump)
global_opts = Trollop::options do
  banner "statistically improbable phrase extractor"
  opt :database_directory, "Database directory", :type => String
  opt :quiet, "Quiet", :short => :q
  stop_on SUB_COMMANDS
end

cmd = ARGV.shift # get the subcommand
cmd_opts = case cmd
  when "add"
    Trollop::options do
      opt :file, "File", :type => String
    end
  when "sip"
    Trollop::options do
      opt :file, "File", :type => String, :required => true
    end
  when "corpus_dump"
  else
    Trollop::die "unknown subcommand #{cmd.inspect}"
  end

gulp = Gulp.new(:database_directory => global_opts[:database_directory] || '.')

case cmd
when "add"
  # A single --file wins; otherwise read one path per line from STDIN.
  files = cmd_opts[:file] ? [cmd_opts[:file]] : STDIN.read.split("\n")

  files.each do |path|
    puts "processing #{path}..."
    doc = gulp.new_from_xml_file(path)
    if doc.already_processed?
      puts "\talready processed."
    else
      doc.process!
      doc.add_to_corpus!
      puts "doc has #{doc.number_of_unique_phrases} unique phrases" unless global_opts[:quiet]

      if doc.number_of_unique_phrases == 0
        puts "no phrases?!?"
        doc.phrase_counts.each do |p, c|
          puts "#{p} => #{c}"
        end
      end
      puts "corpus has #{gulp.corpus.number_of_unique_phrases} unique phrases" unless global_opts[:quiet]
      puts "corpus has #{gulp.corpus.total_number_of_documents} unique documents" unless global_opts[:quiet]
    end
    okay_to_terminate!
  end
when "sip"
  doc = gulp.new_from_xml_file(cmd_opts[:file])
  doc.process!
  phrases = doc.phrases

  phrases.sort_by{|p| p.score}.each do |phrase|
    puts "#{phrase.string} (#{phrase.count})=> #{phrase.score}"
    okay_to_terminate!
  end
when "corpus_dump"
  # Iterating the store with #each yields the stored values as raw packed
  # bytes; going through Corpus#phrase_document_count returns the decoded
  # integer instead.
  gulp.corpus.phrase_document_counts.each_key do |phrase|
    puts "#{phrase} => #{gulp.corpus.phrase_document_count(phrase)}"
    okay_to_terminate!
  end
end
|
data/lib/gulp.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'active_support'
|
3
|
+
require 'tokyocabinet'
|
4
|
+
|
5
|
+
# Entry point of the library: owns the shared corpus and builds documents
# that are bound to it.
class Gulp
  # Kept in step with the gem's VERSION file (0.1.0).
  VERSION = '0.1.0'

  attr_reader :corpus

  # options - Hash; :database_directory is the directory holding the corpus
  #           database files. Defaults to the current directory when the key
  #           is missing or nil, so the database paths are never rooted at "/".
  def initialize(options = {})
    @corpus = Corpus.new(options[:database_directory] || '.')
  end

  # Builds a Gulp::Document for the XML file at +path+, attached to this
  # instance's corpus. The file is not read until the document is processed.
  def new_from_xml_file(path)
    Gulp::Document.new(path, @corpus)
  end
end
|
17
|
+
|
18
|
+
require "gulp/corpus"
|
19
|
+
require "gulp/data_store"
|
20
|
+
require "gulp/document"
|
21
|
+
require "gulp/phrase"
|
22
|
+
require "gulp/phrase_extractor"
|
23
|
+
|
data/lib/gulp/corpus.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
class Gulp
  # Corpus-wide bookkeeping backed by two data stores kept under one
  # directory: one keyed by document name (which documents were processed)
  # and one keyed by phrase (how many documents contained it).
  class Corpus
    attr_reader :phrase_document_counts

    # database_directory - directory in which both stores are kept.
    def initialize(database_directory)
      @database_directory = database_directory
      @processed_documents = open_store('processed_documents')
      @phrase_document_counts = open_store('phrase_document_counts')
    end

    # Records +document_name+ in the processed-documents store.
    def mark_as_processed!(document_name)
      @processed_documents.increment(document_name)
    end

    # True when +document_name+ has an entry in the processed-documents store.
    def already_processed?(document_name)
      @processed_documents.has_key?(document_name)
    end

    # Number of entries in the processed-documents store.
    def total_number_of_documents
      @processed_documents.size
    end

    # Number of entries in the phrase store.
    def number_of_unique_phrases
      @phrase_document_counts.size
    end

    # Adds one to the document count kept for +phrase+.
    def increment_phrase_document_count(phrase)
      @phrase_document_counts.increment(phrase)
    end

    # Document count kept for +phrase+, as returned by the store's #[].
    def phrase_document_count(phrase)
      @phrase_document_counts[phrase]
    end

    private

    # Opens the store called +basename+ inside the database directory.
    def open_store(basename)
      Gulp::DataStore.new("#{@database_directory}/#{basename}")
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
class Gulp
  # Small wrapper around a Tokyo Cabinet hash database (HDB) used as a
  # persistent key => integer counter map.
  class DataStore
    include TokyoCabinet
    include Enumerable

    # Opens (creating if necessary) the database file "<path>.hdb" for writing.
    #
    # Raises IOError when the database cannot be opened, instead of silently
    # continuing with an unusable handle.
    def initialize(path)
      @hdb = HDB::new
      file = path + '.hdb'
      unless @hdb.open(file, HDB::OWRITER | HDB::OCREAT)
        raise IOError, "could not open #{file}: #{@hdb.errmsg(@hdb.ecode)}"
      end
    end

    # Adds one to the integer stored under +key+.
    def increment(key)
      @hdb.addint(key,1)
    end

    # Returns the integer stored under +key+, or 0 when the key is absent.
    # Stored values are packed native integers, hence the unpack.
    def [](key)
      val = @hdb[key]
      val ? val.unpack('i').first : 0
    end

    # Stores +value+ under +key+ as-is (no packing is applied here).
    def []=(key, value)
      @hdb[key] = value
    end

    # True when a record exists for +key+. Tests for nil rather than using
    # ActiveSupport's #present?, which asks whether the stored bytes are
    # blank rather than whether the record exists.
    def has_key?(key)
      !@hdb[key].nil?
    end

    # Removes every record from the database.
    def clear!
      @hdb.vanish
    end

    # Number of records in the database.
    def size
      @hdb.rnum
    end

    # Yields each key.
    def each_key(&proc)
      @hdb.each_key(&proc)
    end

    # Iterates the underlying database directly; values are yielded exactly as
    # stored (not unpacked the way #[] does).
    def each(&proc)
      @hdb.each(&proc)
    end
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
class Gulp
  # One XML file being analysed: accumulates a word count and per-phrase
  # occurrence counts from the file's text, and can fold those phrases into
  # the shared corpus.
  class Document
    attr_reader :name, :corpus, :word_count, :phrase_counts

    # name   - path of the XML file; also used as the document's key in the corpus.
    # corpus - the corpus this document is scored against / added to.
    def initialize(name, corpus)
      @name = name
      @corpus = corpus
      @word_count = 0
      @finalized = false
      @phrase_counts = {}#Gulp::DataStore.new('document')
      @extractor = Gulp::PhraseExtractor.new
    end

    # Parses the XML file with a SAX parser, feeding every text node to
    # #add_text. Returns self so calls can be chained.
    #
    # The file is opened in block form so the handle is closed even when
    # parsing raises.
    def process!
      handler = XMLTextExtractor.new(self)
      File.open(name) do |io|
        Nokogiri::XML::SAX::Parser.new(handler).parse(io)
      end
      self
    end

    # True when the corpus already has this document's name recorded.
    def already_processed?
      @corpus.already_processed?(name)
    end

    # True once the document has been added to the corpus.
    def finalized?
      @finalized
    end

    # Increments the corpus document count once for each distinct phrase of
    # this document and marks the document as processed. Does nothing when
    # the corpus already knows the document.
    def add_to_corpus!
      unless already_processed?
        @finalized = true
        @phrase_counts.each_key do |phrase|
          @corpus.increment_phrase_document_count(phrase)
        end

        @corpus.mark_as_processed!(name)
      end
    end

    # Extracts phrases from +text+ and adds them, and the words counted, to
    # this document's running totals.
    #
    # Raises RuntimeError once the document has been finalized.
    def add_text(text)
      raise "cannot add text once finalized" if finalized?
      word_count, phrases = @extractor.extract(text)
      @word_count += word_count

      phrases.each do |phrase|
        @phrase_counts[phrase] ||= 0
        @phrase_counts[phrase] += 1
      end
    end

    # Number of distinct phrases seen so far.
    def number_of_unique_phrases
      phrase_counts.size
    end

    # One Gulp::Phrase per distinct phrase, carrying its occurrence count.
    def phrases
      phrase_counts.map do |phrase, count|
        Phrase.new(self, phrase, count)
      end
    end
  end

  # SAX handler that forwards every run of character data to a receiver.
  class XMLTextExtractor < Nokogiri::XML::SAX::Document
    # phrase_extractor - object that receives the text through #add_text
    #                    (Document#process! passes the document itself).
    def initialize(phrase_extractor)
      super()
      @phrase_extractor = phrase_extractor
    end

    # SAX callback: hand the text to the receiver.
    def characters(text)
      @phrase_extractor.add_text(text)
    end
  end
end
|
data/lib/gulp/phrase.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
class Gulp
  # A phrase found in a document, with the number of times it occurs there
  # and a TF-IDF style score computed against the document's corpus.
  class Phrase
    attr_accessor :document, :string, :count

    # document - the owning document; must respond to #word_count and #corpus.
    # string   - the phrase text, words separated by single spaces.
    # count    - occurrences of the phrase in the document.
    def initialize(document, string, count)
      @document = document
      @string = string
      @count = count
    end

    # The phrase split into its words.
    def words
      string.split(/ /)
    end

    # Number of words in the phrase.
    def phrase_size
      words.size
    end

    # Share of the document's words taken up by occurrences of this phrase.
    def term_frequency
      (count * phrase_size) / document.word_count.to_f
    end

    # Number of corpus documents containing this phrase.
    def number_of_documents_with_term
      document.corpus.phrase_document_count(string)
    end

    # log(total documents / (1 + documents containing the phrase)).
    #
    # The ratio is computed in floating point: integer division would
    # truncate it (10 / 4 => 2) and, whenever the denominator exceeds the
    # document total, collapse it to 0, giving log(0) = -Infinity.
    def inverse_document_frequency
      Math.log(document.corpus.total_number_of_documents.to_f / (1 + number_of_documents_with_term))
    end

    # Term frequency weighted by inverse document frequency.
    def score
      term_frequency * inverse_document_frequency
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
class Gulp
  # Splits free text into candidate phrases: every run of 2, 3 or 4
  # consecutive words inside a punctuation-delimited chunk, except runs that
  # begin or end with a stopword.
  class PhraseExtractor
    ALLOWED_PHRASE_LENGTHS = [2,3,4].freeze
    STOPWORDS = %w(a an and except from has in into is made of one that the these this to with).freeze

    # Extracts phrases from +text+ without modifying it.
    #
    # Returns a two-element array [word_count, phrases]: word_count is the
    # number of words across all chunks; phrases lists every candidate
    # (duplicates included), grouped per chunk and, within a chunk, by
    # increasing phrase length.
    def extract(text)
      phrases = []
      word_count = 0

      chunk_text(preprocess_text(text)).each do |chunk|
        words = chunk.split(/\s+/)
        word_count += words.size

        ALLOWED_PHRASE_LENGTHS.each do |length|
          # each_cons yields nothing when the chunk has fewer than +length+ words.
          words.each_cons(length) do |candidate|
            next if stopword?(candidate.first) || stopword?(candidate.last)

            phrases << candidate.join(' ')
          end
        end
      end

      [word_count, phrases]
    end

    private

    # Case-insensitive stopword test.
    def stopword?(word)
      STOPWORDS.include?(word.downcase)
    end

    # Returns a copy of +text+ with (...), [...] and {...} parentheticals
    # replaced by a space. Uses non-destructive gsub so the caller's string is
    # left untouched and frozen strings are accepted.
    def preprocess_text(text)
      text.gsub(/\(.+?\)/, ' ').gsub(/\[.+?\]/, ' ').gsub(/\{.+?\}/, ' ')
    end

    # Returns a copy of +text+ keeping only spaces, ASCII letters, digits and
    # hyphens.
    def postprocess_text(text)
      text.gsub(/[^ a-zA-Z0-9-]/,'')
    end

    # Splits +text+ on . , : ; | and returns the non-empty chunks, trimmed and
    # with internal whitespace runs collapsed to one space.
    #
    # Trimming is anchored to the whole string (\A / \z); line anchors would
    # also delete the newlines around a blank line and fuse the words on
    # either side of it.
    def chunk_text(text)
      text.split(/\.|,|:|;|\|/).
        map { |s| s.gsub(/\A\s+|\s+\z/, '').gsub(/\s+/, ' ') }.
        reject { |s| s.empty? }
    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'test_helper'

# Checks the phrase lists produced by Gulp::PhraseExtractor#extract.
class PhraseExtractorTest < Test::Unit::TestCase
  def setup
    @extractor = Gulp::PhraseExtractor.new
  end

  # Phrase list only; the word count (first element of the result) is dropped.
  def phrases_for(text)
    @extractor.extract(text).last
  end

  should "chunk phrases combinatorially" do
    expectations = [
      ["y z", ["y z"]],
      ["x y z", ["x y", "y z", "x y z"]],
      ["w x y z", ["w x", "x y", "y z", "w x y", "x y z", "w x y z"]]
    ]
    expectations.each do |input, expected|
      assert_equal(expected, phrases_for(input))
    end
  end

  should "skip phrases starting with a stopword" do
    assert_equal([], phrases_for("the cow"))
    assert_equal(["cow jumped"], phrases_for("the cow jumped"))
  end

  should "skip phrases ending with a stopword" do
    assert_equal([], phrases_for("cow of"))
    assert_equal(["fancy cow"], phrases_for("fancy cow of"))
  end

  should "split phrases on punctuation" do
    %w(. , ; : |).each do |separator|
      assert_equal(["w x", "y z"], phrases_for("w x#{separator} y z"))
    end
  end

  should "normalize whitespace" do
    ["y z ", " y z", " y z "].each do |padded|
      assert_equal(["y z"], phrases_for(padded))
    end
  end

  should "remove parentheticals first" do
    assert_equal(["y z"], phrases_for("y (alpha beta) z"))
  end
end
|
data/test/test_helper.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'rubygems'
require 'test/unit'
require 'shoulda'

# TODO: remove this
I_KNOW_I_AM_USING_AN_OLD_AND_BUGGY_VERSION_OF_LIBXML2 = 1

# Prepend ../lib and then this directory to the load path, so the test
# directory ends up first and lib second.
test_dir = File.dirname(__FILE__)
[File.join(test_dir, '..', 'lib'), test_dir].each do |dir|
  $LOAD_PATH.unshift(dir)
end
require 'gulp'

# Reopened so shared test helpers have a place to live; currently empty.
class Test::Unit::TestCase
end
|
metadata
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: gulp
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Andrew Carpenter
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-01-23 00:00:00 -08:00
|
13
|
+
default_executable: gulp
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: activesupport
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: tokyocabinet
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: "0"
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: nokogiri
|
37
|
+
type: :runtime
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: "0"
|
44
|
+
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: trollop
|
47
|
+
type: :runtime
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: "0"
|
54
|
+
version:
|
55
|
+
description:
|
56
|
+
email: andrew.main@gmail.com
|
57
|
+
executables:
|
58
|
+
- gulp
|
59
|
+
extensions: []
|
60
|
+
|
61
|
+
extra_rdoc_files:
|
62
|
+
- LICENSE
|
63
|
+
- README.markdown
|
64
|
+
- TODO
|
65
|
+
files:
|
66
|
+
- .document
|
67
|
+
- .gitignore
|
68
|
+
- CHANGELOG
|
69
|
+
- LICENSE
|
70
|
+
- README.markdown
|
71
|
+
- Rakefile
|
72
|
+
- TODO
|
73
|
+
- VERSION
|
74
|
+
- bin/gulp
|
75
|
+
- lib/gulp.rb
|
76
|
+
- lib/gulp/corpus.rb
|
77
|
+
- lib/gulp/data_store.rb
|
78
|
+
- lib/gulp/document.rb
|
79
|
+
- lib/gulp/phrase.rb
|
80
|
+
- lib/gulp/phrase_extractor.rb
|
81
|
+
- test/phrase_extractor_test.rb
|
82
|
+
- test/test_helper.rb
|
83
|
+
has_rdoc: true
|
84
|
+
homepage: http://github.com/andrewcarpenter/gulp
|
85
|
+
licenses: []
|
86
|
+
|
87
|
+
post_install_message:
|
88
|
+
rdoc_options:
|
89
|
+
- --charset=UTF-8
|
90
|
+
require_paths:
|
91
|
+
- lib
|
92
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: "0"
|
97
|
+
version:
|
98
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
99
|
+
requirements:
|
100
|
+
- - ">="
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: "0"
|
103
|
+
version:
|
104
|
+
requirements: []
|
105
|
+
|
106
|
+
rubyforge_project:
|
107
|
+
rubygems_version: 1.3.5
|
108
|
+
signing_key:
|
109
|
+
specification_version: 3
|
110
|
+
summary: Identify Statistically Improbable Phrases (SIPs)
|
111
|
+
test_files:
|
112
|
+
- test/phrase_extractor_test.rb
|
113
|
+
- test/test_helper.rb
|