gulp 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ README.markdown
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,8 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
6
+ doc
7
+ tmp
8
+ *.hdb
@@ -0,0 +1 @@
1
+ v0.0.1. Initial release
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010 Andrew Carpenter
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,25 @@
1
+ # Gulp
2
+
3
+ Ruby gem for identifying Statistically Improbable Phrases across a large document set.
4
+
5
+ This is pre-alpha; use at your own risk. API will change.
6
+
7
+ ## Install
8
+
9
+ [sudo] gem install gulp
10
+
11
+ ## Usage
12
+
13
+ gulp = Gulp.new(:database_directory => '/path/to/dir')
14
+
15
+ gulp.new_from_xml_file(path_1).process!.add_to_corpus!
16
+ gulp.new_from_xml_file(path_2).process!.add_to_corpus!
17
+ gulp.new_from_xml_file(path_3).process!.add_to_corpus!
18
+
19
+ doc = gulp.new_from_xml_file(path_4)
20
+ doc.process!
21
+ doc.phrases # => [<Gulp::Phrase>, <Gulp::Phrase>]
22
+
23
+ ## Copyright
24
+
25
+ Copyright (c) 2010 Andrew Carpenter. See LICENSE for details.
@@ -0,0 +1,59 @@
1
# Rakefile for the gulp gem: gem packaging (Jeweler), unit tests, coverage
# (RCov) and RDoc generation.
require 'rubygems'
require 'rake'
require 'yaml'

# Packaging tasks. Jeweler is optional: when it cannot be loaded a hint is
# printed and the remaining tasks are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "gulp"
    gem.summary = %Q{Identify Statistically Improbable Phrases (SIPs)}
    gem.email = "andrew.main@gmail.com"
    gem.homepage = "http://github.com/andrewcarpenter/gulp"
    gem.authors = ["Andrew Carpenter"]
    gem.add_dependency 'activesupport'
    gem.add_dependency 'tokyocabinet'
    gem.add_dependency 'nokogiri'
    gem.add_dependency 'trollop'
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# `rake test` runs every test/**/*_test.rb with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# Coverage task. When RCov is missing, `rake rcov` aborts with install
# instructions instead of being undefined.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end


task :default => :test

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # The gem ships a plain-text VERSION file; VERSION.yml is still honoured
  # for checkouts that keep the version in that format.
  version =
    if File.exist?('VERSION')
      File.read('VERSION').strip
    elsif File.exist?('VERSION.yml')
      config = YAML.load(File.read('VERSION.yml'))
      "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
    else
      ""
    end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "gulp #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
59
+
data/TODO ADDED
@@ -0,0 +1,3 @@
1
+ * Port to mongo
2
+ * Use data store for individual document's phrase_counts
3
+ * gemify
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,94 @@
1
#!/usr/bin/env ruby
#
# Command-line front end for Gulp, the statistically improbable phrase
# extractor.
#
# Global options: --database-directory (defaults to the current directory)
# and --quiet.
#
# Sub-commands:
#   add          process --file (or newline-separated paths read from STDIN)
#                and add each document to the corpus
#   sip          process --file and print its phrases ordered by score
#   corpus_dump  print every corpus phrase with its document count

# TODO: remove this
$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__)) + '/../lib')

# TODO: remove this
# NOTE(review): presumably silences Nokogiri's warning about an outdated
# libxml2 -- confirm before removing.
I_KNOW_I_AM_USING_AN_OLD_AND_BUGGY_VERSION_OF_LIBXML2 = 1

require 'rubygems'
require 'trollop'
require 'gulp'

# Set by the INT handler and polled by okay_to_terminate!. It has to be an
# instance variable (of the top-level object): a local variable would not be
# visible inside the method below.
@interrupted = false
trap("INT") do
  puts "waiting so can exit cleanly..."
  @interrupted = true
end

# Exits the process if an interrupt was requested. Called only between units
# of work so that a unit is never abandoned half-way.
def okay_to_terminate!
  if @interrupted
    puts "exited cleanly."
    exit
  end
end

SUB_COMMANDS = %w(add sip corpus_dump)
global_opts = Trollop::options do
  banner "statistically improbable phrase extractor"
  opt :database_directory, "Database directory", :type => String
  opt :quiet, "Quiet", :short => :q
  stop_on SUB_COMMANDS
end

cmd = ARGV.shift # get the subcommand
cmd_opts = case cmd
  when "add"
    Trollop::options do
      opt :file, "File", :type => String
    end
  when "sip"
    Trollop::options do
      opt :file, "File", :type => String, :required => true
    end
  when "corpus_dump"
    # no sub-command options
  else
    Trollop::die "unknown subcommand #{cmd.inspect}"
  end

gulp = Gulp.new(:database_directory => global_opts[:database_directory] || '.')

case cmd
when "add"
  if cmd_opts[:file]
    files = [cmd_opts[:file]]
  else
    files = STDIN.read.split("\n")
  end

  files.each do |path|
    puts "processing #{path}..."
    doc = gulp.new_from_xml_file(path)
    if doc.already_processed?
      puts "\talready processed."
    else
      doc.process!
      doc.add_to_corpus!
      puts "doc has #{doc.number_of_unique_phrases} unique phrases" unless global_opts[:quiet]

      # Diagnostic output for documents that yielded nothing.
      if doc.number_of_unique_phrases == 0
        puts "no phrases?!?"
        doc.phrase_counts.each do |p, c|
          puts "#{p} => #{c}"
        end
      end
      puts "corpus has #{gulp.corpus.number_of_unique_phrases} unique phrases" unless global_opts[:quiet]
      puts "corpus has #{gulp.corpus.total_number_of_documents} unique documents" unless global_opts[:quiet]
    end
    okay_to_terminate!
  end
when "sip"
  doc = gulp.new_from_xml_file(cmd_opts[:file])
  doc.process!
  phrases = doc.phrases

  phrases.sort_by{|p| p.score}.each do |phrase|
    puts "#{phrase.string} (#{phrase.count})=> #{phrase.score}"
    okay_to_terminate!
  end
when "corpus_dump"
  # Look each count up through the corpus: iterating the store directly
  # yields the raw packed bytes of the counter, not the integer.
  corpus = gulp.corpus
  corpus.phrase_document_counts.each_key do |phrase|
    puts "#{phrase} => #{corpus.phrase_document_count(phrase)}"
    okay_to_terminate!
  end
end
@@ -0,0 +1,23 @@
1
+ require 'nokogiri'
2
+ require 'active_support'
3
+ require 'tokyocabinet'
4
+
5
# Entry point of the library: owns the corpus and builds documents that are
# scored against it.
class Gulp
  # Library version, matching the released gem version.
  VERSION = '0.1.0'

  # The Gulp::Corpus shared by every document created through this instance.
  attr_reader :corpus

  # options - Hash of settings:
  #           :database_directory - directory holding the corpus database
  #                                 files (default: the current directory).
  def initialize(options = {})
    @corpus = Corpus.new(options[:database_directory] || '.')
  end

  # Builds an unprocessed Gulp::Document for the XML file at +path+, bound to
  # this instance's corpus. The file is not read until the document is
  # processed.
  def new_from_xml_file(path)
    Gulp::Document.new(path, @corpus)
  end
end
17
+
18
+ require "gulp/corpus"
19
+ require "gulp/data_store"
20
+ require "gulp/document"
21
+ require "gulp/phrase"
22
+ require "gulp/phrase_extractor"
23
+
@@ -0,0 +1,35 @@
1
class Gulp
  # Corpus-wide statistics kept in two data stores under one directory: the
  # set of documents already added, and for every phrase the number of
  # documents containing it.
  class Corpus
    # Data store mapping phrase => number of documents containing it.
    attr_reader :phrase_document_counts

    # database_directory - directory in which both data stores live.
    def initialize(database_directory)
      @database_directory = database_directory
      @processed_documents, @phrase_document_counts =
        %w(processed_documents phrase_document_counts).map do |store_name|
          Gulp::DataStore.new("#{@database_directory}/#{store_name}")
        end
    end

    # Records +document_name+ as added to the corpus.
    def mark_as_processed!(document_name)
      @processed_documents.increment(document_name)
    end

    # Whether +document_name+ has been recorded via mark_as_processed!.
    def already_processed?(document_name)
      @processed_documents.has_key?(document_name)
    end

    # Number of distinct documents recorded so far.
    def total_number_of_documents
      @processed_documents.size
    end

    # Number of distinct phrases seen across the corpus.
    def number_of_unique_phrases
      @phrase_document_counts.size
    end

    # Adds one to the document count of +phrase+.
    def increment_phrase_document_count(phrase)
      @phrase_document_counts.increment(phrase)
    end

    # Number of documents recorded as containing +phrase+.
    def phrase_document_count(phrase)
      @phrase_document_counts[phrase]
    end
  end
end
@@ -0,0 +1,43 @@
1
class Gulp
  # Persistent counter table backed by a Tokyo Cabinet hash database file.
  # Counters are written with HDB#addint and decoded on read with
  # String#unpack('i').
  class DataStore
    include TokyoCabinet
    include Enumerable

    # Opens (creating if necessary) the database file "<path>.hdb" for
    # writing.
    #
    # Raises RuntimeError when the database cannot be opened, rather than
    # carrying on with a handle on which every later operation would fail.
    def initialize(path)
      @hdb = HDB::new
      file = path + '.hdb'
      unless @hdb.open(file, HDB::OWRITER | HDB::OCREAT)
        raise "could not open #{file}: #{@hdb.errmsg(@hdb.ecode)}"
      end
    end

    # Adds one to the counter stored under +key+.
    def increment(key)
      @hdb.addint(key,1)
    end

    # Returns the counter stored under +key+ as an Integer, or 0 when the key
    # has no record.
    def [](key)
      val = @hdb[key]
      val ? val.unpack('i').first : 0
    end

    # Stores +value+ under +key+ exactly as given.
    # NOTE(review): the value is not packed, so it will not round-trip
    # through #[] unless the caller packs it the same way -- confirm intent.
    def []=(key, value)
      @hdb[key] = value
    end

    # Whether a record exists for +key+. Tests for nil directly so that the
    # answer depends neither on ActiveSupport's String#present? being loaded
    # nor on the stored bytes happening to be blank.
    def has_key?(key)
      !@hdb[key].nil?
    end

    # Removes every record from the database.
    def clear!
      @hdb.vanish
    end

    # Number of records in the database.
    def size
      @hdb.rnum
    end

    # Yields each key.
    def each_key(&proc)
      @hdb.each_key(&proc)
    end

    # Yields each key together with its stored value. Values arrive as stored
    # (raw bytes for counters), not decoded the way #[] decodes them.
    def each(&proc)
      @hdb.each(&proc)
    end
  end
end
@@ -0,0 +1,71 @@
1
class Gulp
  # One XML document: accumulates its word count and per-phrase occurrence
  # counts, and can fold those phrases into the corpus exactly once.
  class Document
    attr_reader :name, :corpus, :word_count, :phrase_counts

    # name   - path of the XML file; also the key under which the document
    #          is recorded in the corpus.
    # corpus - the Gulp::Corpus this document is scored against.
    def initialize(name, corpus)
      @name = name
      @corpus = corpus
      @word_count = 0
      @finalized = false
      @phrase_counts = {}#Gulp::DataStore.new('document')
      @extractor = Gulp::PhraseExtractor.new
    end

    # Parses the file at +name+ with a SAX parser, feeding every run of
    # character data to add_text. Returns self so calls can be chained.
    #
    # Counts accumulate: calling process! twice on the same document doubles
    # them.
    def process!
      handler = XMLTextExtractor.new(self)
      # Block form closes the file handle even if parsing raises.
      File.open(name) do |io|
        Nokogiri::XML::SAX::Parser.new(handler).parse(io)
      end
      self
    end

    # Whether the corpus already has this document recorded.
    def already_processed?
      @corpus.already_processed?(name)
    end

    # Whether the document has been added to the corpus by this object and
    # therefore no longer accepts text.
    def finalized?
      @finalized
    end

    # Increments the corpus document count of every phrase in this document
    # and records the document as processed. Does nothing when the corpus
    # already has the document.
    def add_to_corpus!
      unless already_processed?
        @finalized = true
        @phrase_counts.each_key do |phrase|
          @corpus.increment_phrase_document_count(phrase)
        end

        @corpus.mark_as_processed!(name)
      end
    end

    # Extracts phrases from +text+ and merges them into the running totals.
    #
    # Raises RuntimeError once the document has been finalized.
    def add_text(text)
      raise "cannot add text once finalized" if finalized?
      word_count, phrases = @extractor.extract(text)
      @word_count += word_count

      phrases.each do |phrase|
        @phrase_counts[phrase] = @phrase_counts.fetch(phrase, 0) + 1
      end
    end

    # Number of distinct phrases found so far.
    def number_of_unique_phrases
      phrase_counts.size
    end

    # Returns one Gulp::Phrase per distinct phrase, carrying its count.
    def phrases
      phrase_counts.map do |phrase, count|
        Phrase.new(self, phrase, count)
      end
    end
  end

  # SAX handler that forwards every run of character data to the object it
  # was built with, via add_text. Document#process! passes itself.
  class XMLTextExtractor < Nokogiri::XML::SAX::Document
    # phrase_extractor - any object responding to add_text(text).
    def initialize(phrase_extractor)
      super()
      @phrase_extractor = phrase_extractor
    end

    # SAX callback invoked with the text content encountered by the parser.
    def characters(text)
      @phrase_extractor.add_text(text)
    end
  end
end
@@ -0,0 +1,34 @@
1
class Gulp
  # A phrase found in a document, together with the statistics needed to
  # score it (term frequency x inverse document frequency).
  class Phrase
    attr_accessor :document, :string, :count

    # document - the owning document; must respond to word_count and corpus.
    # string   - the phrase text, words separated by single spaces.
    # count    - number of occurrences of the phrase in the document.
    def initialize(document, string, count)
      @document = document
      @string = string
      @count = count
    end

    # The phrase split into its words.
    def words
      string.split(/ /)
    end

    # Number of words in the phrase.
    def phrase_size
      words.size
    end

    # Share of the document's words covered by occurrences of this phrase.
    def term_frequency
      (count * phrase_size) / document.word_count.to_f
    end

    # Number of corpus documents recorded as containing the phrase.
    def number_of_documents_with_term
      document.corpus.phrase_document_count(string)
    end

    # log(total documents / (1 + documents containing the phrase)).
    #
    # The ratio is computed in floating point: with integer division it
    # truncated to 0 or a whole number, yielding -Infinity or a badly
    # quantised value for most phrases. An empty corpus still gives
    # -Infinity (log of 0.0).
    def inverse_document_frequency
      total_documents = document.corpus.total_number_of_documents.to_f
      Math.log(total_documents / (1 + number_of_documents_with_term))
    end

    # Improbability score: higher means more characteristic of the document.
    def score
      term_frequency * inverse_document_frequency
    end
  end
end
@@ -0,0 +1,49 @@
1
class Gulp
  # Splits free text into candidate phrases: every run of 2, 3 or 4
  # consecutive words inside a punctuation-delimited chunk that neither
  # starts nor ends with a stopword.
  class PhraseExtractor
    ALLOWED_PHRASE_LENGTHS = [2,3,4]
    STOPWORDS = %w(a an and except from has in into is made of one that the these this to with)

    # text - String to analyse; it is not modified.
    #
    # Returns [word_count, phrases]: the total number of words seen and the
    # Array of phrase Strings (with repeats), ordered by chunk, then phrase
    # length, then position.
    def extract(text)
      phrases = []
      word_count = 0

      chunk_text(preprocess_text(text)).each do |chunk|
        words = chunk.split(/\s+/)
        word_count += words.size

        ALLOWED_PHRASE_LENGTHS.each do |length|
          # each_cons yields nothing when the chunk is shorter than length.
          words.each_cons(length) do |candidate|
            next if stopword?(candidate.first) || stopword?(candidate.last)

            phrases << candidate.join(' ')
          end
        end
      end

      [word_count, phrases]
    end

    private

    # Case-insensitive membership test against STOPWORDS.
    def stopword?(word)
      STOPWORDS.include?(word.downcase)
    end

    # Returns a copy of +text+ with (...), [...] and {...} parentheticals
    # replaced by a space. Works on a copy so the caller's string is left
    # untouched and frozen strings are accepted.
    def preprocess_text(text)
      text.gsub(/\(.+?\)/, ' ').gsub(/\[.+?\]/, ' ').gsub(/\{.+?\}/, ' ')
    end

    # Returns a copy of +text+ keeping only spaces, ASCII letters, digits
    # and hyphens. Not called by extract.
    def postprocess_text(text)
      text.gsub(/[^ a-zA-Z0-9-]/,'')
    end

    # Splits +text+ on . , : ; | and returns the non-empty chunks with
    # surrounding whitespace removed and inner whitespace runs collapsed to
    # one space. Trimming is anchored to the whole string (\A, \z):
    # line-based anchors would delete the whitespace around a blank line and
    # fuse the neighbouring words together.
    def chunk_text(text)
      text.split(/[.,:;|]/).
        map { |s| s.gsub(/\A\s+|\s+\z/, '').gsub(/\s+/, ' ') }.
        reject { |s| s.empty? }
    end
  end
end
@@ -0,0 +1,45 @@
1
+ require 'test_helper'
2
+
3
# Unit tests for Gulp::PhraseExtractor#extract, looking only at the list of
# phrases it returns.
class PhraseExtractorTest < Test::Unit::TestCase
  def setup
    @extractor = Gulp::PhraseExtractor.new
  end

  # Phrases extracted from +text+; the word count is discarded.
  def phrases_for(text)
    @extractor.extract(text).last
  end

  should "chunk phrases combinatorially" do
    [
      ["y z",     ["y z"]],
      ["x y z",   ["x y", "y z", "x y z"]],
      ["w x y z", ["w x", "x y", "y z", "w x y", "x y z", "w x y z"]]
    ].each do |text, expected|
      assert_equal(expected, phrases_for(text))
    end
  end

  should "skip phrases starting with a stopword" do
    [
      ["the cow",        []],
      ["the cow jumped", ["cow jumped"]]
    ].each do |text, expected|
      assert_equal(expected, phrases_for(text))
    end
  end

  should "skip phrases ending with a stopword" do
    [
      ["cow of",       []],
      ["fancy cow of", ["fancy cow"]]
    ].each do |text, expected|
      assert_equal(expected, phrases_for(text))
    end
  end

  should "split phrases on punctuation" do
    %w(. , ; : |).each do |char|
      assert_equal ["w x", "y z"], phrases_for("w x#{char} y z")
    end
  end

  should "normalize whitespace" do
    ["y z ", " y z", " y z "].each do |text|
      assert_equal ["y z"], phrases_for(text)
    end
  end

  should "remove parentheticals first" do
    assert_equal ["y z"], phrases_for("y (alpha beta) z")
  end
end
@@ -0,0 +1,13 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+
5
+ # TODO: remove this
6
+ I_KNOW_I_AM_USING_AN_OLD_AND_BUGGY_VERSION_OF_LIBXML2 = 1
7
+
8
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
9
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
10
+ require 'gulp'
11
+
12
+ class Test::Unit::TestCase
13
+ end
metadata ADDED
@@ -0,0 +1,113 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gulp
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Andrew Carpenter
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-01-23 00:00:00 -08:00
13
+ default_executable: gulp
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: activesupport
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: tokyocabinet
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: nokogiri
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: trollop
47
+ type: :runtime
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: "0"
54
+ version:
55
+ description:
56
+ email: andrew.main@gmail.com
57
+ executables:
58
+ - gulp
59
+ extensions: []
60
+
61
+ extra_rdoc_files:
62
+ - LICENSE
63
+ - README.markdown
64
+ - TODO
65
+ files:
66
+ - .document
67
+ - .gitignore
68
+ - CHANGELOG
69
+ - LICENSE
70
+ - README.markdown
71
+ - Rakefile
72
+ - TODO
73
+ - VERSION
74
+ - bin/gulp
75
+ - lib/gulp.rb
76
+ - lib/gulp/corpus.rb
77
+ - lib/gulp/data_store.rb
78
+ - lib/gulp/document.rb
79
+ - lib/gulp/phrase.rb
80
+ - lib/gulp/phrase_extractor.rb
81
+ - test/phrase_extractor_test.rb
82
+ - test/test_helper.rb
83
+ has_rdoc: true
84
+ homepage: http://github.com/andrewcarpenter/gulp
85
+ licenses: []
86
+
87
+ post_install_message:
88
+ rdoc_options:
89
+ - --charset=UTF-8
90
+ require_paths:
91
+ - lib
92
+ required_ruby_version: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: "0"
97
+ version:
98
+ required_rubygems_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: "0"
103
+ version:
104
+ requirements: []
105
+
106
+ rubyforge_project:
107
+ rubygems_version: 1.3.5
108
+ signing_key:
109
+ specification_version: 3
110
+ summary: Identify Statistically Improbable Phrases (SIPs)
111
+ test_files:
112
+ - test/phrase_extractor_test.rb
113
+ - test/test_helper.rb