RubyGems - gulp - Versions diffs - 0.1.0 - Mend

gulp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

data/.document +5 -0
data/.gitignore +8 -0
data/CHANGELOG +1 -0
data/LICENSE +20 -0
data/README.markdown +25 -0
data/Rakefile +59 -0
data/TODO +3 -0
data/VERSION +1 -0
data/bin/gulp +94 -0
data/lib/gulp.rb +23 -0
data/lib/gulp/corpus.rb +35 -0
data/lib/gulp/data_store.rb +43 -0
data/lib/gulp/document.rb +71 -0
data/lib/gulp/phrase.rb +34 -0
data/lib/gulp/phrase_extractor.rb +49 -0
data/test/phrase_extractor_test.rb +45 -0
data/test/test_helper.rb +13 -0
metadata +113 -0

data/.document ADDED

@@ -0,0 +1,5 @@
+README.markdown
+lib/**/*.rb
+bin/*
+features/**/*.feature
+LICENSE

data/.gitignore ADDED

@@ -0,0 +1,8 @@
+*.sw?
+.DS_Store
+coverage
+rdoc
+pkg
+doc
+tmp
+*.hdb

data/CHANGELOG ADDED

@@ -0,0 +1 @@
+v0.0.1. Initial release

data/LICENSE ADDED

@@ -0,0 +1,20 @@
+Copyright (c) 2010 Andrew Carpenter
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.markdown ADDED

@@ -0,0 +1,25 @@
+# Gulp
+Ruby gem for identifying Statistically Improbable Phrases across a large document set.
+This is pre-alpha; use at your own risk.  API will change.
+## Install
+    [sudo] gem install gulp
+## Usage
+    gulp = Gulp.new(:database_directory => '/path/to/dir')
+    gulp.new_from_xml_file(path_1).process!.add_to_corpus!
+    gulp.new_from_xml_file(path_2).process!.add_to_corpus!
+    gulp.new_from_xml_file(path_3).process!.add_to_corpus!
+    doc = gulp.new_from_xml_file(path_4)
+    doc.process!
+    doc.phrases # => [<Gulp::Phrase>, <Gulp::Phrase>]
+## Copyright
+Copyright (c) 2010 Andrew Carpenter. See LICENSE for details.

data/Rakefile ADDED

@@ -0,0 +1,59 @@
+require 'rubygems'
+require 'rake'
+# Packaging tasks. Jeweler is optional: if it cannot be loaded we only print a
+# hint, and the test/rcov/rdoc tasks below are still defined.
+begin
+  require 'jeweler'
+  Jeweler::Tasks.new do |gem|
+    gem.name = "gulp"
+    gem.summary = %Q{Identify Statistically Improbable Phrases (SIPs)}
+    gem.email = "andrew.main@gmail.com"
+    gem.homepage = "http://github.com/andrewcarpenter/gulp"
+    gem.authors = ["Andrew Carpenter"]
+    gem.add_dependency 'activesupport'
+    gem.add_dependency 'tokyocabinet'
+    gem.add_dependency 'nokogiri'
+    gem.add_dependency 'trollop'
+  end
+  Jeweler::GemcutterTasks.new
+rescue LoadError
+  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+end
+# `rake test` runs every test/**/*_test.rb with lib/ and test/ on the load path.
+require 'rake/testtask'
+Rake::TestTask.new(:test) do |test|
+  test.libs << 'lib' << 'test'
+  test.pattern = 'test/**/*_test.rb'
+  test.verbose = true
+end
+# `rake rcov` measures coverage when rcov is installed; otherwise the task
+# aborts with installation instructions.
+begin
+  require 'rcov/rcovtask'
+  Rcov::RcovTask.new do |test|
+    test.libs << 'test'
+    test.pattern = 'test/**/*_test.rb'
+    test.verbose = true
+  end
+rescue LoadError
+  task :rcov do
+    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
+  end
+end
+task :default => :test
+require 'rake/rdoctask'
+Rake::RDocTask.new do |rdoc|
+  # The project ships a plain-text VERSION file, so prefer it for the rdoc
+  # title. VERSION.yml is still honoured as a fallback, and YAML is required
+  # explicitly instead of relying on it having been loaded by something else.
+  version =
+    if File.exist?('VERSION')
+      File.read('VERSION').strip
+    elsif File.exist?('VERSION.yml')
+      require 'yaml'
+      config = YAML.load(File.read('VERSION.yml'))
+      "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
+    else
+      ""
+    end
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "gulp #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end

data/TODO ADDED

@@ -0,0 +1,3 @@
+* Port to mongo
+* Use data store for individual document's phrase_counts
+* gemify

data/VERSION ADDED

@@ -0,0 +1 @@
+0.1.0

data/bin/gulp ADDED

@@ -0,0 +1,94 @@
+#!/usr/bin/env ruby
+# Command-line front end for Gulp.
+#
+# Subcommands:
+#   add          process XML file(s) and add their phrases to the corpus
+#                (paths come from --file or, failing that, one per line on STDIN)
+#   sip          print the scored phrases of a single XML file
+#   corpus_dump  print every corpus phrase with its document count
+#
+# TODO: remove this
+$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__)) + '/../lib')
+# TODO: remove this
+I_KNOW_I_AM_USING_AN_OLD_AND_BUGGY_VERSION_OF_LIBXML2 = 1
+require 'rubygems'
+require 'trollop'
+require 'gulp'
+# Set by the INT handler and polled by okay_to_terminate! so that Ctrl-C only
+# stops the program between units of work. This must be the same instance
+# variable the handler and the helper use; a plain local would be invisible
+# inside the `def` below.
+@interrupted = false
+trap("INT") do
+  puts "waiting so can exit cleanly..."
+  @interrupted = true
+end
+# Exits the process if an interrupt was requested; otherwise does nothing.
+def okay_to_terminate!
+  if @interrupted
+    puts "exited cleanly."
+    exit
+  end
+end
+SUB_COMMANDS = %w(add sip corpus_dump)
+global_opts = Trollop::options do
+  banner "statistically improbable phrase extractor"
+  opt :database_directory, "Database directory", :type => String
+  opt :quiet, "Quiet", :short => :q
+  stop_on SUB_COMMANDS
+end
+cmd = ARGV.shift # get the subcommand
+cmd_opts = case cmd
+  when "add"
+    Trollop::options do
+      opt :file, "File", :type => String
+    end
+  when "sip"
+    Trollop::options do
+      opt :file, "File", :type => String, :required => true
+    end
+  when "corpus_dump"
+    # no subcommand-specific options
+  else
+    Trollop::die "unknown subcommand #{cmd.inspect}"
+  end
+# The database directory defaults to the current working directory.
+gulp = Gulp.new(:database_directory => global_opts[:database_directory] || '.')
+case cmd
+when "add"
+  if cmd_opts[:file]
+    files = [cmd_opts[:file]]
+  else
+    files = STDIN.read.split("\n")
+  end
+  files.each do |path|
+    puts "processing #{path}..."
+    doc = gulp.new_from_xml_file(path)
+    if doc.already_processed?
+      puts "\talready processed."
+    else
+      doc.process!
+      doc.add_to_corpus!
+      puts "doc has #{doc.number_of_unique_phrases} unique phrases" unless global_opts[:quiet]
+      if doc.number_of_unique_phrases == 0
+        puts "no phrases?!?"
+        doc.phrase_counts.each do |p, c|
+          puts "#{p} => #{c}"
+        end
+      end
+      puts "corpus has #{gulp.corpus.number_of_unique_phrases} unique phrases" unless global_opts[:quiet]
+      puts "corpus has #{gulp.corpus.total_number_of_documents} unique documents" unless global_opts[:quiet]
+    end
+    okay_to_terminate!
+  end
+when "sip"
+  doc = gulp.new_from_xml_file(cmd_opts[:file])
+  doc.process!
+  phrases = doc.phrases
+  phrases.sort_by{|p| p.score}.each do |phrase|
+    puts "#{phrase.string} (#{phrase.count})=> #{phrase.score}"
+    okay_to_terminate!
+  end
+when "corpus_dump"
+  corpus = gulp.corpus
+  # Iterate keys and look each count up through the corpus so the decoded
+  # integer is printed rather than the raw stored value.
+  corpus.phrase_document_counts.each_key do |phrase|
+    puts "#{phrase} => #{corpus.phrase_document_count(phrase)}"
+    okay_to_terminate!
+  end
+end

data/lib/gulp.rb ADDED

@@ -0,0 +1,23 @@
+require 'nokogiri'
+require 'active_support'
+require 'tokyocabinet'
+# Entry point of the library: owns the shared corpus and builds documents
+# that are scored against it.
+class Gulp
+  # Kept in sync with the VERSION file / gemspec of this release.
+  VERSION = '0.1.0'
+  # The Gulp::Corpus backing this instance.
+  attr_reader :corpus
+  # options - Hash; :database_directory is the directory handed to
+  #           Gulp::Corpus for its database files.
+  def initialize(options)
+    @corpus = Corpus.new(options[:database_directory])
+  end
+  # Builds (but does not process) a Gulp::Document for the XML file at `path`,
+  # bound to this instance's corpus.
+  def new_from_xml_file(path)
+    Gulp::Document.new(path, @corpus)
+  end
+end
+require "gulp/corpus"
+require "gulp/data_store"
+require "gulp/document"
+require "gulp/phrase"
+require "gulp/phrase_extractor"

data/lib/gulp/corpus.rb ADDED

@@ -0,0 +1,35 @@
+class Gulp
+  # Corpus-wide bookkeeping backed by two Gulp::DataStore instances: one keyed
+  # by document name (which documents were processed) and one keyed by phrase
+  # (in how many documents each phrase occurred).
+  class Corpus
+    attr_reader :phrase_document_counts
+    # database_directory - directory in which both stores are opened.
+    def initialize(database_directory)
+      @database_directory = database_directory
+      # Opened in this order: processed_documents, then phrase_document_counts.
+      @processed_documents, @phrase_document_counts =
+        %w(processed_documents phrase_document_counts).map do |store_name|
+          Gulp::DataStore.new("#{@database_directory}/#{store_name}")
+        end
+    end
+    # Records that the named document has been added to the corpus.
+    def mark_as_processed!(document_name); @processed_documents.increment(document_name); end
+    # True when the named document was recorded by mark_as_processed!.
+    def already_processed?(document_name); @processed_documents.has_key?(document_name); end
+    # Number of entries in the processed-documents store.
+    def total_number_of_documents; @processed_documents.size; end
+    # Number of entries in the phrase store.
+    def number_of_unique_phrases; @phrase_document_counts.size; end
+    # Adds one to the document count of `phrase`.
+    def increment_phrase_document_count(phrase); @phrase_document_counts.increment(phrase); end
+    # Document count stored for `phrase`.
+    def phrase_document_count(phrase); @phrase_document_counts[phrase]; end
+  end
+end

data/lib/gulp/data_store.rb ADDED

@@ -0,0 +1,43 @@
+class Gulp
+  # Counter store on top of a Tokyo Cabinet hash database (HDB). Values
+  # written by #increment are native integers packed as bytes; #[] and #each
+  # decode them back to Integers.
+  class DataStore
+    include TokyoCabinet
+    include Enumerable
+    # Opens (creating if necessary) the database file "<path>.hdb".
+    #
+    # Raises RuntimeError when the database cannot be opened, instead of
+    # continuing with a handle on which every later call would fail silently.
+    def initialize(path)
+      @hdb = HDB::new
+      unless @hdb.open(path + '.hdb', HDB::OWRITER | HDB::OCREAT)
+        raise "could not open #{path}.hdb: #{@hdb.errmsg(@hdb.ecode)}"
+      end
+    end
+    # Adds 1 to the integer stored under `key`.
+    def increment(key)
+      @hdb.addint(key,1)
+    end
+    # Returns the integer stored under `key`, or 0 when the key is absent.
+    def [](key)
+      val = @hdb[key]
+      val ? val.unpack('i').first : 0
+    end
+    # Stores `value` as-is (no integer packing is applied here).
+    # NOTE(review): values written this way are not in the format #[] decodes;
+    # confirm whether any caller depends on raw writes.
+    def []=(key, value)
+      @hdb[key] = value
+    end
+    # True when a record exists for `key`. Checks for nil rather than
+    # "present?", so existence does not depend on the stored bytes.
+    def has_key?(key)
+      !@hdb[key].nil?
+    end
+    # Removes every record.
+    def clear!
+      @hdb.vanish
+    end
+    # Number of records.
+    def size
+      @hdb.rnum
+    end
+    # Yields each key.
+    def each_key(&proc)
+      @hdb.each_key(&proc)
+    end
+    # Yields (key, count) pairs with the count decoded the same way as #[],
+    # so Enumerable methods operate on Integers instead of packed bytes.
+    # Returns an enumerator when called without a block.
+    def each
+      return enum_for(:each) unless block_given?
+      @hdb.each do |key, val|
+        yield key, (val ? val.unpack('i').first : 0)
+      end
+    end
+  end
+end

data/lib/gulp/document.rb ADDED

@@ -0,0 +1,71 @@
+class Gulp
+  # One XML document: accumulates word and phrase counts while the file is
+  # parsed and can then contribute its phrases to the corpus.
+  class Document
+    attr_reader :name, :corpus, :word_count, :phrase_counts
+    # name   - path of the XML file; also the key under which the corpus
+    #          remembers the document as processed.
+    # corpus - the Gulp::Corpus this document is scored against / added to.
+    def initialize(name, corpus)
+      @name = name
+      @corpus = corpus
+      @word_count = 0
+      @finalized = false
+      @phrase_counts = {}#Gulp::DataStore.new('document')
+      @extractor = Gulp::PhraseExtractor.new
+    end
+    # SAX-parses the file at `name`, feeding every text node to #add_text.
+    # Returns self so calls can be chained.
+    def process!
+      parser = Nokogiri::XML::SAX::Parser.new(XMLTextExtractor.new(self))
+      # Block form closes the file handle even when parsing raises.
+      File.open(name) { |io| parser.parse(io) }
+      self
+    end
+    # True when the corpus already has this document recorded.
+    def already_processed?
+      @corpus.already_processed?(name)
+    end
+    # True once the document has been added to the corpus.
+    def finalized?
+      @finalized
+    end
+    # Adds one document-count per distinct phrase to the corpus and marks the
+    # document as processed. Does nothing when it was already processed.
+    def add_to_corpus!
+      unless already_processed?
+        @finalized = true
+        @phrase_counts.each_key do |phrase|
+          @corpus.increment_phrase_document_count(phrase)
+        end
+        @corpus.mark_as_processed!(name)
+      end
+    end
+    # Extracts phrases from `text` and folds them into the running counts.
+    # Raises RuntimeError once the document has been finalized.
+    def add_text(text)
+      raise "cannot add text once finalized" if finalized?
+      word_count, phrases = @extractor.extract(text)
+      @word_count += word_count
+      phrases.each do |phrase|
+        @phrase_counts[phrase] ||= 0
+        @phrase_counts[phrase] += 1
+      end
+    end
+    # Number of distinct phrases seen so far.
+    def number_of_unique_phrases
+      phrase_counts.size
+    end
+    # One Gulp::Phrase per distinct phrase, carrying its in-document count.
+    def phrases
+      phrase_counts.map do |phrase, count|
+        Phrase.new(self, phrase, count)
+      end
+    end
+  end
+  # SAX handler that forwards every character-data event to its target.
+  class XMLTextExtractor < Nokogiri::XML::SAX::Document
+    # phrase_extractor - any object responding to add_text(text); Document
+    #                    passes itself here.
+    def initialize(phrase_extractor)
+      super()
+      @phrase_extractor = phrase_extractor
+    end
+    def characters(text)
+      @phrase_extractor.add_text(text)
+    end
+  end
+end

data/lib/gulp/phrase.rb ADDED

@@ -0,0 +1,34 @@
+class Gulp
+  # A phrase found in a document, with the tf-idf style score used to rank it.
+  class Phrase
+    attr_accessor :document, :string, :count
+    # document - the owning document; must respond to word_count and corpus.
+    # string   - the phrase text, words separated by single spaces.
+    # count    - number of occurrences of the phrase in the document.
+    def initialize(document, string, count)
+      @document = document
+      @string = string
+      @count = count
+    end
+    # The phrase split into its words.
+    def words
+      string.split(/ /)
+    end
+    # Number of words in the phrase.
+    def phrase_size
+      words.size
+    end
+    # Share of the document's words covered by occurrences of this phrase.
+    def term_frequency
+      (count * phrase_size) / document.word_count.to_f
+    end
+    # Number of corpus documents that contain the phrase.
+    def number_of_documents_with_term
+      document.corpus.phrase_document_count(string)
+    end
+    # log(total documents / (1 + documents containing the phrase)).
+    # The ratio is computed in floating point: integer division would truncate
+    # it (10 / 4 => 2) and can reach 0, making the logarithm -Infinity.
+    def inverse_document_frequency
+      Math.log(document.corpus.total_number_of_documents.to_f / (1 + number_of_documents_with_term))
+    end
+    # term_frequency * inverse_document_frequency.
+    def score
+      term_frequency * inverse_document_frequency
+    end
+  end
+end

data/lib/gulp/phrase_extractor.rb ADDED

@@ -0,0 +1,49 @@
+class Gulp
+  # Splits text into punctuation-delimited chunks and emits every 2-, 3- and
+  # 4-word window of each chunk that neither starts nor ends with a stopword.
+  class PhraseExtractor
+    ALLOWED_PHRASE_LENGTHS = [2,3,4].freeze
+    STOPWORDS = %w(a an and except from has in into is made of one that the these this to with).freeze
+    # Returns [word_count, phrases]: the number of words seen across all
+    # chunks, and the extracted phrases grouped per chunk, shortest length
+    # first. `text` is not modified.
+    def extract(text)
+      phrases = []
+      word_count = 0
+      chunk_text(preprocess_text(text)).each do |chunk|
+        words = chunk.split(/\s+/)
+        word_count += words.size
+        ALLOWED_PHRASE_LENGTHS.each do |length|
+          # each_cons yields nothing when the chunk is shorter than `length`.
+          words.each_cons(length) do |window|
+            next if stopword?(window.first) || stopword?(window.last)
+            phrases << window.join(' ')
+          end
+        end
+      end
+      [word_count, phrases]
+    end
+    private
+    # Case-insensitive stopword check.
+    def stopword?(word)
+      STOPWORDS.include?(word.downcase)
+    end
+    # Replaces (...), [...] and {...} groups with a space. Works on copies so
+    # the caller's string is left untouched and frozen input is accepted.
+    def preprocess_text(text)
+      text.gsub(/\(.+?\)/, ' ').gsub(/\[.+?\]/, ' ').gsub(/\{.+?\}/, ' ')
+    end
+    # Drops every character other than space, ASCII letters, digits and '-'.
+    def postprocess_text(text)
+      text.gsub(/[^ a-zA-Z0-9-]/,'')
+    end
+    # Splits on . , : ; | and normalizes whitespace inside each chunk.
+    # Trimming is anchored to the whole string (\A / \z): with the per-line
+    # anchors ^ / $ a blank line between two words lost all of its whitespace
+    # and the words were fused together.
+    def chunk_text(text)
+      text.split(/\.|,|:|;|\|/).map{|s| s.gsub(/\A\s+|\s+\z/,'').gsub(/\s+/, ' ')}.reject{|s| s.empty?}
+    end
+  end
+end
+end

data/test/phrase_extractor_test.rb ADDED

@@ -0,0 +1,45 @@
+require 'test_helper'
+# Behavioural tests for Gulp::PhraseExtractor#extract (phrases only).
+class PhraseExtractorTest < Test::Unit::TestCase
+  def setup
+    @extractor = Gulp::PhraseExtractor.new
+  end
+  # extract returns [word_count, phrases]; these tests only look at phrases.
+  def phrases_for(text)
+    @extractor.extract(text).last
+  end
+  should "chunk phrases combinatorially" do
+    assert_equal ["y z"], phrases_for("y z")
+    assert_equal ["x y", "y z", "x y z"], phrases_for("x y z")
+    assert_equal ["w x", "x y", "y z", "w x y", "x y z", "w x y z"], phrases_for("w x y z")
+  end
+  should "skip phrases starting with a stopword" do
+    assert_equal [], phrases_for("the cow")
+    assert_equal ["cow jumped"], phrases_for("the cow jumped")
+  end
+  should "skip phrases ending with a stopword" do
+    assert_equal [], phrases_for("cow of")
+    assert_equal ["fancy cow"], phrases_for("fancy cow of")
+  end
+  should "split phrases on punctuation" do
+    %w(. , ; : |).each do |char|
+      assert_equal(["w x", "y z"], phrases_for("w x#{char} y z"))
+    end
+  end
+  should "normalize whitespace" do
+    ["y   z   ", "    y   z", "    y   z   "].each do |padded|
+      assert_equal(["y z"], phrases_for(padded))
+    end
+  end
+  should "remove parentheticals first" do
+    assert_equal(["y z"], phrases_for("y (alpha beta) z"))
+  end
+end

data/test/test_helper.rb ADDED

@@ -0,0 +1,13 @@
+# Shared test bootstrap: loads the test libraries, puts lib/ and test/ on the
+# load path, then loads the gem under test.
+require 'rubygems'
+require 'test/unit'
+require 'shoulda'
+# TODO: remove this
+I_KNOW_I_AM_USING_AN_OLD_AND_BUGGY_VERSION_OF_LIBXML2 = 1
+# Each unshift prepends, so test/ ends up ahead of lib/ on the load path.
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+$LOAD_PATH.unshift(File.dirname(__FILE__))
+require 'gulp'
+# Reopened with an empty body; nothing is added to TestCase here.
+class Test::Unit::TestCase
+end

metadata ADDED

@@ -0,0 +1,113 @@
+--- !ruby/object:Gem::Specification
+name: gulp
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Andrew Carpenter
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2010-01-23 00:00:00 -08:00
+default_executable: gulp
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: activesupport
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
+- !ruby/object:Gem::Dependency
+  name: tokyocabinet
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
+- !ruby/object:Gem::Dependency
+  name: trollop
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
+description:
+email: andrew.main@gmail.com
+executables:
+- gulp
+extensions: []
+extra_rdoc_files:
+- LICENSE
+- README.markdown
+- TODO
+files:
+- .document
+- .gitignore
+- CHANGELOG
+- LICENSE
+- README.markdown
+- Rakefile
+- TODO
+- VERSION
+- bin/gulp
+- lib/gulp.rb
+- lib/gulp/corpus.rb
+- lib/gulp/data_store.rb
+- lib/gulp/document.rb
+- lib/gulp/phrase.rb
+- lib/gulp/phrase_extractor.rb
+- test/phrase_extractor_test.rb
+- test/test_helper.rb
+has_rdoc: true
+homepage: http://github.com/andrewcarpenter/gulp
+licenses: []
+post_install_message:
+rdoc_options:
+- --charset=UTF-8
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: Identify Statistically Improbable Phrases (SIPs)
+test_files:
+- test/phrase_extractor_test.rb
+- test/test_helper.rb