gulp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,8 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
6
+ doc
7
+ tmp
8
+ *.hdb
@@ -0,0 +1 @@
1
+ v0.0.1. Initial release
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010 Andrew Carpenter
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,25 @@
1
+ # Gulp
2
+
3
+ Ruby gem for identifying Statistically Improbable Phrases across a large document set.
4
+
5
+ This is pre-alpha; use at your own risk. API will change.
6
+
7
+ ## Install
8
+
9
+ [sudo] gem install gulp
10
+
11
+ ## Usage
12
+
13
+ gulp = Gulp.new(:database_directory => '/path/to/dir')
14
+
15
+ gulp.new_from_xml_file(path_1).process!.add_to_corpus!
16
+ gulp.new_from_xml_file(path_2).process!.add_to_corpus!
17
+ gulp.new_from_xml_file(path_3).process!.add_to_corpus!
18
+
19
+ doc = gulp.new_from_xml_file(path_4)
20
+ doc.process!
21
+ doc.phrases # => [<Gulp::Phrase>, <Gulp::Phrase>]
22
+
23
+ ## Copyright
24
+
25
+ Copyright (c) 2010 Andrew Carpenter. See LICENSE for details.
@@ -0,0 +1,59 @@
1
require 'rubygems'
require 'rake'

# Gem packaging/release tasks. They are only defined when Jeweler can be
# loaded; otherwise a hint is printed and the remaining tasks still work.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "gulp"
    gem.summary = %Q{Identify Statistically Improbable Phrases (SIPs)}
    gem.email = "andrew.main@gmail.com"
    gem.homepage = "http://github.com/andrewcarpenter/gulp"
    gem.authors = ["Andrew Carpenter"]
    gem.add_dependency 'activesupport'
    gem.add_dependency 'tokyocabinet'
    gem.add_dependency 'nokogiri'
    gem.add_dependency 'trollop'
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# `rake test` runs every test/**/*_test.rb with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# Coverage task; replaced by a stub that aborts with instructions when RCov
# is not installed.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end


task :default => :test

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # The package ships a plain-text VERSION file, so read that first. The older
  # VERSION.yml layout is still honoured as a fallback.
  version =
    if File.exist?('VERSION')
      File.read('VERSION').strip
    elsif File.exist?('VERSION.yml')
      require 'yaml' # loaded lazily: only this branch needs it
      config = YAML.load(File.read('VERSION.yml'))
      "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
    else
      ""
    end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "gulp #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
59
+
data/TODO ADDED
@@ -0,0 +1,3 @@
1
+ * Port to mongo
2
+ * Use data store for individual document's phrase_counts
3
+ * gemify
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,94 @@
1
#!/usr/bin/env ruby
#
# Command-line front end for Gulp. Sub-commands:
#   add          process XML file(s) and add their phrases to the corpus
#   sip          score the phrases of a single XML file against the corpus
#   corpus_dump  print every phrase in the corpus with its document count

# TODO: remove this
$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__)) + '/../lib')

# TODO: remove this
I_KNOW_I_AM_USING_AN_OLD_AND_BUGGY_VERSION_OF_LIBXML2 = 1

require 'rubygems'
require 'trollop'
require 'gulp'

# Set by the INT handler and polled by okay_to_terminate!. It has to be an
# instance variable (of the top-level object) because a local variable would
# not be visible inside the method below.
@interrupted = false
trap("INT") do
  puts "waiting so can exit cleanly..."
  @interrupted = true
end

# Exits the process if an interrupt was requested. Called between units of
# work so that Ctrl-C never stops in the middle of one.
def okay_to_terminate!
  return unless @interrupted

  puts "exited cleanly."
  exit
end

SUB_COMMANDS = %w(add sip corpus_dump)
global_opts = Trollop::options do
  banner "statistically improbable phrase extractor"
  opt :database_directory, "Database directory", :type => String
  opt :quiet, "Quiet", :short => :q
  stop_on SUB_COMMANDS
end

cmd = ARGV.shift # get the subcommand
cmd_opts = case cmd
  when "add"
    Trollop::options do
      opt :file, "File", :type => String
    end
  when "sip"
    Trollop::options do
      opt :file, "File", :type => String, :required => true
    end
  when "corpus_dump"
    # no sub-command options
  else
    Trollop::die "unknown subcommand #{cmd.inspect}"
  end

gulp = Gulp.new(:database_directory => global_opts[:database_directory] || '.')

case cmd
when "add"
  # Either a single --file, or a newline-separated list of paths on STDIN.
  files = if cmd_opts[:file]
    [cmd_opts[:file]]
  else
    STDIN.read.split("\n")
  end

  files.each do |path|
    puts "processing #{path}..."
    doc = gulp.new_from_xml_file(path)
    if doc.already_processed?
      puts "\talready processed."
    else
      doc.process!
      doc.add_to_corpus!
      puts "doc has #{doc.number_of_unique_phrases} unique phrases" unless global_opts[:quiet]

      if doc.number_of_unique_phrases == 0
        puts "no phrases?!?"
        doc.phrase_counts.each do |p, c|
          puts "#{p} => #{c}"
        end
      end
      puts "corpus has #{gulp.corpus.number_of_unique_phrases} unique phrases" unless global_opts[:quiet]
      puts "corpus has #{gulp.corpus.total_number_of_documents} unique documents" unless global_opts[:quiet]
    end
    okay_to_terminate!
  end
when "sip"
  doc = gulp.new_from_xml_file(cmd_opts[:file])
  doc.process!
  phrases = doc.phrases

  # Ascending by score, so the most improbable phrases are printed last.
  phrases.sort_by{|p| p.score}.each do |phrase|
    puts "#{phrase.string} (#{phrase.count})=> #{phrase.score}"
    okay_to_terminate!
  end
when "corpus_dump"
  # each, not map: this loop is run only for its output.
  gulp.corpus.phrase_document_counts.each do |phrase, count|
    puts "#{phrase} => #{count}"
    okay_to_terminate!
  end
end
@@ -0,0 +1,23 @@
1
+ require 'nokogiri'
2
+ require 'active_support'
3
+ require 'tokyocabinet'
4
+
5
# Entry point of the library: owns the corpus and builds documents that are
# bound to it.
class Gulp
  # Kept in step with the released gem version (VERSION file and gemspec).
  VERSION = '0.1.0'

  attr_reader :corpus

  # options - Hash of settings:
  #           :database_directory - directory handed to Corpus.new for its
  #                                 data files. Defaults to '.', the same
  #                                 fallback the gulp executable uses, so a
  #                                 missing value no longer produces paths
  #                                 rooted at "/".
  def initialize(options = {})
    @corpus = Corpus.new(options[:database_directory] || '.')
  end

  # Builds (but does not process) a Gulp::Document for the XML file at +path+,
  # attached to this instance's corpus.
  def new_from_xml_file(path)
    Gulp::Document.new(path, @corpus)
  end
end
17
+
18
+ require "gulp/corpus"
19
+ require "gulp/data_store"
20
+ require "gulp/document"
21
+ require "gulp/phrase"
22
+ require "gulp/phrase_extractor"
23
+
@@ -0,0 +1,35 @@
1
class Gulp
  # Corpus-wide bookkeeping backed by two Gulp::DataStore instances: one keyed
  # by document name (which documents were processed) and one keyed by phrase
  # (in how many documents each phrase occurred).
  class Corpus
    attr_reader :phrase_document_counts

    # database_directory - directory prefix under which both stores are opened.
    def initialize(database_directory)
      @database_directory = database_directory
      @processed_documents = open_store('processed_documents')
      @phrase_document_counts = open_store('phrase_document_counts')
    end

    # Records +document_name+ as processed.
    def mark_as_processed!(document_name)
      @processed_documents.increment(document_name)
    end

    # True when +document_name+ has an entry in the processed-documents store.
    def already_processed?(document_name)
      @processed_documents.has_key?(document_name)
    end

    # Number of entries in the processed-documents store.
    def total_number_of_documents
      @processed_documents.size
    end

    # Number of distinct phrases recorded in the corpus.
    def number_of_unique_phrases
      @phrase_document_counts.size
    end

    # Adds one to the document count of +phrase+.
    def increment_phrase_document_count(phrase)
      @phrase_document_counts.increment(phrase)
    end

    # Document count stored for +phrase+.
    def phrase_document_count(phrase)
      @phrase_document_counts[phrase]
    end

    private

    # Opens the data store called +name+ inside the database directory.
    def open_store(name)
      Gulp::DataStore.new("#{@database_directory}/#{name}")
    end
  end
end
@@ -0,0 +1,43 @@
1
class Gulp
  # Persistent String => Integer counter map stored in a TokyoCabinet hash
  # database. Counts are kept in the database as packed native ints (the
  # format HDB#addint writes) and are decoded on every read path.
  class DataStore
    include TokyoCabinet
    include Enumerable

    # Opens (creating if necessary) the database file "<path>.hdb".
    #
    # Raises RuntimeError when the database cannot be opened, instead of
    # leaving behind a handle on which every later call would fail silently.
    def initialize(path)
      @hdb = HDB::new
      unless @hdb.open(path + '.hdb', HDB::OWRITER | HDB::OCREAT)
        raise "could not open #{path}.hdb: #{@hdb.errmsg(@hdb.ecode)}"
      end
    end

    # Adds one to the counter stored under +key+.
    def increment(key)
      @hdb.addint(key,1)
    end

    # Returns the count stored under +key+, or 0 when the key is absent.
    def [](key)
      decode(@hdb[key])
    end

    # Stores the Integer +value+ under +key+, packed the same way #[] unpacks
    # it, so that store[key] = n; store[key] round-trips.
    def []=(key, value)
      @hdb[key] = [value].pack('i')
    end

    # True when a record exists for +key+. Checks for nil rather than using
    # ActiveSupport's present?, which would treat a packed value consisting
    # only of whitespace bytes as missing.
    def has_key?(key)
      !@hdb[key].nil?
    end

    # Removes every record.
    def clear!
      @hdb.vanish
    end

    # Number of records.
    def size
      @hdb.rnum
    end

    # Yields each key.
    def each_key(&proc)
      @hdb.each_key(&proc)
    end

    # Yields each key together with its decoded Integer count (the same value
    # #[] returns). Returns an Enumerator when called without a block.
    def each
      return enum_for(:each) unless block_given?

      @hdb.each { |key, raw| yield key, decode(raw) }
    end

    private

    # Converts a raw database value into an Integer; nil (no record) becomes 0.
    def decode(raw)
      raw ? raw.unpack('i').first : 0
    end
  end
end
@@ -0,0 +1,71 @@
1
class Gulp
  # One XML document: accumulates word and phrase counts from its text and can
  # contribute its phrases to the corpus it was created with.
  class Document
    attr_reader :name, :corpus, :word_count, :phrase_counts

    # name   - path of the XML file; also used as the document's key in the corpus.
    # corpus - corpus object this document reports to.
    def initialize(name, corpus)
      @name = name
      @corpus = corpus
      @word_count = 0
      @finalized = false
      @phrase_counts = {}#Gulp::DataStore.new('document')
      @extractor = Gulp::PhraseExtractor.new
    end

    # SAX-parses the file at +name+, feeding every character run to #add_text.
    # The file is opened in block form so the handle is closed even when
    # parsing raises. Returns self so calls can be chained.
    def process!
      handler = XMLTextExtractor.new(self)
      parser = Nokogiri::XML::SAX::Parser.new(handler)
      File.open(name) { |io| parser.parse(io) }
      self
    end

    # True when the corpus already has this document's name recorded.
    def already_processed?
      @corpus.already_processed?(name)
    end

    # True once the document has been added to the corpus.
    def finalized?
      @finalized
    end

    # Increments the corpus document count of every distinct phrase in this
    # document and marks the document as processed. Does nothing when the
    # corpus already knows the document. Afterwards #add_text raises.
    def add_to_corpus!
      unless already_processed?
        @finalized = true
        @phrase_counts.each_key do |phrase|
          @corpus.increment_phrase_document_count(phrase)
        end

        @corpus.mark_as_processed!(name)
      end
    end

    # Extracts phrases from +text+ and adds them, and the number of words
    # seen, to the running totals. Raises RuntimeError once finalized.
    def add_text(text)
      raise "cannot add text once finalized" if finalized?
      word_count, phrases = @extractor.extract(text)
      @word_count += word_count

      phrases.each do |phrase|
        @phrase_counts[phrase] ||= 0
        @phrase_counts[phrase] += 1
      end
    end

    # Number of distinct phrases seen so far.
    def number_of_unique_phrases
      phrase_counts.size
    end

    # One Phrase object per distinct phrase, carrying its in-document count.
    def phrases
      phrase_counts.map do |phrase, count|
        Phrase.new(self, phrase, count)
      end
    end
  end

  # SAX handler that forwards every character-data event to the #add_text
  # method of the object it was constructed with.
  class XMLTextExtractor < Nokogiri::XML::SAX::Document
    # phrase_extractor - any object responding to add_text(text); Document
    #                    passes itself here.
    def initialize(phrase_extractor)
      super()
      @phrase_extractor = phrase_extractor
    end

    def characters(text)
      @phrase_extractor.add_text(text)
    end
  end
end
@@ -0,0 +1,34 @@
1
class Gulp
  # A phrase found in a document, scored by term frequency multiplied by
  # inverse document frequency.
  class Phrase
    attr_accessor :document, :string, :count

    # document - object responding to word_count and corpus.
    # string   - the phrase text, words separated by single spaces.
    # count    - number of occurrences of the phrase in the document.
    def initialize(document, string, count)
      @document = document
      @string = string
      @count = count
    end

    # The individual words of the phrase.
    def words
      string.split(/ /)
    end

    # Number of words in the phrase.
    def phrase_size
      words.size
    end

    # Share of the document's words covered by occurrences of this phrase.
    def term_frequency
      (count * phrase_size) / document.word_count.to_f
    end

    # Number of corpus documents recorded as containing the phrase.
    def number_of_documents_with_term
      document.corpus.phrase_document_count(string)
    end

    # log(total documents / (1 + documents containing the phrase)).
    #
    # The ratio is computed in floating point: with integer division it was
    # truncated before the logarithm, collapsing to log(0) = -Infinity as soon
    # as the phrase occurred in at least as many documents as the corpus holds
    # minus one. An empty corpus (0 documents) still yields -Infinity.
    def inverse_document_frequency
      total_documents = document.corpus.total_number_of_documents.to_f
      Math.log(total_documents / (1 + number_of_documents_with_term))
    end

    # TF-IDF style score of the phrase.
    def score
      term_frequency * inverse_document_frequency
    end
  end
end
@@ -0,0 +1,49 @@
1
class Gulp
  # Splits text into punctuation-delimited chunks and emits every 2-, 3- and
  # 4-word window of each chunk that neither starts nor ends with a stopword.
  class PhraseExtractor
    ALLOWED_PHRASE_LENGTHS = [2,3,4]
    STOPWORDS = %w(a an and except from has in into is made of one that the these this to with)

    # text - String to analyse. It is not modified.
    #
    # Returns [word_count, phrases]: the number of words found across all
    # chunks, and the phrases grouped per chunk, shortest window length first,
    # in order of appearance (duplicates included).
    def extract(text)
      phrases = []
      word_count = 0

      chunk_text(preprocess_text(text)).each do |string|
        words = string.split(/\s+/)
        word_count += words.size

        ALLOWED_PHRASE_LENGTHS.each do |length|
          # each_cons yields nothing when the chunk is shorter than +length+.
          words.each_cons(length) do |sub_phrase_words|
            next if stopword?(sub_phrase_words.first) || stopword?(sub_phrase_words.last)

            phrases << sub_phrase_words.join(' ')
          end
        end
      end

      [word_count, phrases]
    end

    private

    # Case-insensitive stopword check.
    def stopword?(word)
      STOPWORDS.include?(word.downcase)
    end

    # Returns a copy of +text+ with (...), [...] and {...} spans replaced by a
    # space. Non-destructive, so frozen input works and the caller's string is
    # left untouched.
    def preprocess_text(text)
      text.gsub(/\(.+?\)/, ' ').gsub(/\[.+?\]/, ' ').gsub(/\{.+?\}/, ' ')
    end

    # Returns a copy of +text+ keeping only spaces, ASCII letters, digits and
    # hyphens. Not called from within this class.
    def postprocess_text(text)
      text.gsub(/[^ a-zA-Z0-9-]/,'')
    end

    # Splits on . , : ; and |, trims and collapses whitespace in every piece
    # and drops pieces that end up blank.
    def chunk_text(text)
      text.split(/\.|,|:|;|\|/).compact.map{|s| s.gsub(/^\s+|\s+$/,'').gsub(/\s+/, ' ')}.reject{|s| s =~ /^\s*$/}
    end
  end
end
@@ -0,0 +1,45 @@
1
+ require 'test_helper'
2
+
3
# Behavioural tests for Gulp::PhraseExtractor#extract (phrase list only).
class PhraseExtractorTest < Test::Unit::TestCase
  def setup
    @extractor = Gulp::PhraseExtractor.new
  end

  # extract returns [word_count, phrases]; these tests only look at the phrases.
  def phrases_for(text)
    @extractor.extract(text).last
  end

  should "chunk phrases combinatorially" do
    assert_equal ["y z"], phrases_for("y z")
    assert_equal ["x y", "y z", "x y z"], phrases_for("x y z")
    assert_equal ["w x", "x y", "y z", "w x y", "x y z", "w x y z"], phrases_for("w x y z")
  end

  should "skip phrases starting with a stopword" do
    assert_equal [], phrases_for("the cow")
    assert_equal ["cow jumped"], phrases_for("the cow jumped")
  end

  should "skip phrases ending with a stopword" do
    assert_equal [], phrases_for("cow of")
    assert_equal ["fancy cow"], phrases_for("fancy cow of")
  end

  should "split phrases on punctuation" do
    %w(. , ; : |).each do |separator|
      assert_equal ["w x", "y z"], phrases_for("w x#{separator} y z")
    end
  end

  should "normalize whitespace" do
    expected = ["y z"]
    assert_equal expected, phrases_for("y z ")
    assert_equal expected, phrases_for(" y z")
    assert_equal expected, phrases_for(" y z ")
  end

  should "remove parentheticals first" do
    assert_equal ["y z"], phrases_for("y (alpha beta) z")
  end
end
@@ -0,0 +1,13 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+
5
+ # TODO: remove this
6
+ I_KNOW_I_AM_USING_AN_OLD_AND_BUGGY_VERSION_OF_LIBXML2 = 1
7
+
8
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
9
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
10
+ require 'gulp'
11
+
12
+ class Test::Unit::TestCase
13
+ end
metadata ADDED
@@ -0,0 +1,113 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gulp
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Andrew Carpenter
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-01-23 00:00:00 -08:00
13
+ default_executable: gulp
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: activesupport
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: tokyocabinet
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: nokogiri
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: trollop
47
+ type: :runtime
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: "0"
54
+ version:
55
+ description:
56
+ email: andrew.main@gmail.com
57
+ executables:
58
+ - gulp
59
+ extensions: []
60
+
61
+ extra_rdoc_files:
62
+ - LICENSE
63
+ - README.markdown
64
+ - TODO
65
+ files:
66
+ - .document
67
+ - .gitignore
68
+ - CHANGELOG
69
+ - LICENSE
70
+ - README.markdown
71
+ - Rakefile
72
+ - TODO
73
+ - VERSION
74
+ - bin/gulp
75
+ - lib/gulp.rb
76
+ - lib/gulp/corpus.rb
77
+ - lib/gulp/data_store.rb
78
+ - lib/gulp/document.rb
79
+ - lib/gulp/phrase.rb
80
+ - lib/gulp/phrase_extractor.rb
81
+ - test/phrase_extractor_test.rb
82
+ - test/test_helper.rb
83
+ has_rdoc: true
84
+ homepage: http://github.com/andrewcarpenter/gulp
85
+ licenses: []
86
+
87
+ post_install_message:
88
+ rdoc_options:
89
+ - --charset=UTF-8
90
+ require_paths:
91
+ - lib
92
+ required_ruby_version: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: "0"
97
+ version:
98
+ required_rubygems_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: "0"
103
+ version:
104
+ requirements: []
105
+
106
+ rubyforge_project:
107
+ rubygems_version: 1.3.5
108
+ signing_key:
109
+ specification_version: 3
110
+ summary: Identify Statistically Improbable Phrases (SIPs)
111
+ test_files:
112
+ - test/phrase_extractor_test.rb
113
+ - test/test_helper.rb